use crate::ids::stable_chunk_id;
use crate::query_expansion::{expand_query_terms, normalize_for_index};
use crate::temporal::{parse_temporal_date, resolve_temporal_target};
use crate::tier1::{RankedTerm, Tier1Entity};
use anyhow::Result;
use chrono::NaiveDate;
use deunicode::deunicode;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::Path;
use std::sync::OnceLock;
use std::time::Instant;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::Value;
use tantivy::schema::{Field, Schema, STORED, STRING, TEXT};
use tantivy::{doc, Index, IndexReader};
// Per-field boosts handed to the tantivy query parser in `lexical_bm25`
// (content, headings, important terms, entities respectively).
const LEXICAL_CONTENT_BOOST: f32 = 1.0;
const LEXICAL_HEADINGS_BOOST: f32 = 1.4;
const LEXICAL_TERMS_BOOST: f32 = 2.0;
const LEXICAL_ENTITIES_BOOST: f32 = 2.4;
// NOTE(review): none of the constants below are referenced in the part of the
// file reviewed here. Judging by their names they are weights for entity/term
// matches (exact, prefix and expanded), metadata overlap, timestamp presence
// and graph boosts — confirm against the scoring code before retuning.
const FULL_QUERY_ENTITY_WEIGHT: f32 = 1.5;
const ENTITY_TERM_WEIGHT: f32 = 1.2;
const ENTITY_PREFIX_MULTIPLIER: f32 = 0.25;
const IMPORTANT_TERM_WEIGHT: f32 = 0.8;
const IMPORTANT_TERM_PREFIX_MULTIPLIER: f32 = 0.25;
const EXPANDED_ENTITY_WEIGHT: f32 = 0.45;
const EXPANDED_TERM_WEIGHT: f32 = 0.35;
const TOPIC_OVERLAP_WEIGHT: f32 = 0.35;
const DOC_TYPE_OVERLAP_WEIGHT: f32 = 0.25;
const TIMESTAMP_PRESENCE_WEIGHT: f32 = 0.05;
const DOC_LINK_GRAPH_WEIGHT: f32 = 0.22;
const GRAPH_MAX_BOOST: f32 = 0.6;
const ENTITY_GRAPH_WEIGHT: f32 = 0.08;
const ENTITY_GRAPH_MAX_CANDIDATES: usize = 100;
// NOTE(review): presumably sizes/limits for the final ranking stage (rerank
// window, per-group result cap) — usage is outside this view; confirm.
const FINAL_RERANK_WINDOW: usize = 200;
const MAX_RESULTS_PER_GROUP: usize = 2;
// NOTE(review): presumably parameters of the optional text re-rank enabled by
// the `text_rerank_ngram` / `text_rerank_lcs` flags on `MemoryIndex` — confirm.
const TEXT_RERANK_WEIGHT: f32 = 0.08;
const TEXT_RERANK_NGRAM_WEIGHT: f32 = 0.08;
const TEXT_RERANK_LCS_WEIGHT: f32 = 0.05;
const TEXT_RERANK_NGRAM_SIZE: usize = 2;
const TEXT_RERANK_WINDOW: usize = 30;
const TEXT_RERANK_CONTENT_CHARS: usize = 4096;
/// Origin metadata carried by every [`DocRecord`] (and copied into
/// [`RedactedDocRecord`]).
///
/// All fields are free-form strings; nothing in this part of the file
/// interprets them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Provenance {
/// Where the document came from.
pub source: String,
/// Optional timestamp string. NOTE(review): format is not established here.
pub timestamp: Option<String>,
/// NOTE(review): presumably the name of the NER component that produced the entities — confirm.
pub ner_provider: String,
/// NOTE(review): presumably the name of the component that ranked important terms — confirm.
pub term_ranker: String,
/// Version label of the index that produced this record.
pub index_version: String,
}
/// A subject / predicate / object statement stored in `DocRecord::top_claims`.
///
/// When claim scoring is enabled, `MemoryIndex::from_records_internal` indexes
/// the tokens returned by `claim_tokens` for each claim, using
/// `confidence.max(0.1)` as the posting score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Claim {
pub subject: String,
pub predicate: String,
pub object: String,
/// Confidence attached to the claim; floored at 0.1 when used as a posting score.
pub confidence: f32,
}
/// One section of a document: a heading, its text and the line range it covers.
///
/// When a record arrives without any chunks, the index builders synthesize a
/// single chunk spanning the whole document (lines `1..=line_count`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SectionChunk {
/// Identifier of the chunk; synthesized chunks get theirs from `stable_chunk_id`.
pub chunk_id: String,
/// Section heading; its tokens are indexed as chunk-level terms.
pub heading: String,
/// Section text; fed into the lexical index `content` field.
pub content: String,
/// First line of the chunk. Defaults to 0 when absent from serialized input.
#[serde(default)]
pub start_line: usize,
/// Last line of the chunk. Defaults to 0 when absent from serialized input.
#[serde(default)]
pub end_line: usize,
/// Optional timestamp string; defaults to `None` when absent.
#[serde(default)]
pub timestamp: Option<String>,
/// Entity strings for this chunk (normalized again with `normalize_for_index` at indexing time).
pub key_entities: Vec<String>,
/// Important-term strings for this chunk (normalized again at indexing time).
pub important_terms: Vec<String>,
}
/// Full per-document record held in `MemoryIndex::docs`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocRecord {
/// Key under which the record is stored in the index.
pub doc_id: String,
pub source: String,
/// Raw document text. Skipped when serializing, but still read when deserializing.
#[serde(skip_serializing)]
pub content: String,
pub timestamp: Option<String>,
/// NOTE(review): unit (bytes, chars, tokens) is not established in this part of the file.
pub doc_length: usize,
pub author_agent: Option<String>,
pub group_id: Option<String>,
/// Topic label; lower-cased when used as a key of `topic_to_docs`.
pub probable_topic: Option<String>,
/// Document-type label; lower-cased when used as a key of `doc_type_to_docs`.
pub doc_type_guess: Option<String>,
/// Document headings; the first one names the synthesized chunk when `section_chunks` is empty.
pub headings: Vec<String>,
/// Defaults to empty when absent from serialized input.
#[serde(default)]
pub doc_links: Vec<String>,
/// Joined with spaces into the lexical index `temporal_terms` field. Defaults to empty.
#[serde(default)]
pub temporal_terms: Vec<String>,
pub key_entities: Vec<Tier1Entity>,
pub important_terms: Vec<RankedTerm>,
/// Section-level chunks; never empty once the record has passed through an index builder.
pub section_chunks: Vec<SectionChunk>,
pub embedding: Option<Vec<f32>>,
pub top_claims: Vec<Claim>,
pub provenance: Provenance,
}
/// One entry of an entity posting list: a document and the score of the
/// entity in it. Lists are sorted by descending score when the index is built.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityPosting {
pub doc_id: String,
pub score: f32,
}
/// One entry of a term (or claim-token) posting list: a document and a score.
/// Lists are sorted by descending score when the index is built.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TermPosting {
pub doc_id: String,
pub score: f32,
}
/// Crate-internal description of a single chunk (identity, owning document,
/// heading, line range, entities and terms).
///
/// NOTE(review): not constructed or read in the part of the file reviewed
/// here; presumably produced by the semantic aggregation pipeline — confirm.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)]
pub(crate) struct SemanticChunkState {
pub chunk_id: String,
pub doc_id: String,
pub heading: String,
pub start_line: usize,
pub end_line: usize,
pub key_entities: Vec<String>,
pub important_terms: Vec<String>,
}
/// Crate-internal per-document state: the document's chunks plus posting
/// maps keyed by entity, term and claim token.
///
/// NOTE(review): not used in the part of the file reviewed here; presumably
/// merged into a [`SemanticAggregate`] elsewhere — confirm.
#[derive(Debug, Clone, Default)]
pub(crate) struct SemanticDocState {
pub doc_id: String,
pub chunk_ids: Vec<String>,
/// Chunk states keyed by chunk id.
pub chunks: HashMap<String, SemanticChunkState>,
pub entity_to_docs: HashMap<String, Vec<EntityPosting>>,
pub term_to_docs: HashMap<String, Vec<TermPosting>>,
pub claim_to_docs: HashMap<String, Vec<TermPosting>>,
pub topic: Option<String>,
pub doc_type: Option<String>,
}
/// Precomputed sparse tables that `MemoryIndex::from_records_with_semantic_aggregate`
/// adopts instead of deriving them from the records.
#[derive(Debug, Clone, Default)]
pub(crate) struct SemanticAggregate {
/// Chunk id -> owning doc id. Not read by the index builder in this view.
pub chunk_to_doc: HashMap<String, String>,
/// Doc id -> chunk ids; the builder sorts each list before assigning chunk numbers.
pub doc_to_chunks: HashMap<String, Vec<String>>,
/// Chunk id -> `(start_line, end_line)`; chunks without an entry here are skipped by the builder.
pub chunk_ranges: HashMap<String, (usize, usize)>,
/// Term key -> `(chunk_id, score)` postings.
pub term_to_chunks: HashMap<String, Vec<(String, f32)>>,
/// Entity key -> `(chunk_id, score)` postings.
pub entity_to_chunks: HashMap<String, Vec<(String, f32)>>,
/// Document-level maps below are moved into the index as-is
/// (`claim_to_docs` only when claim scoring is enabled).
pub entity_to_docs: HashMap<String, Vec<EntityPosting>>,
pub term_to_docs: HashMap<String, Vec<TermPosting>>,
pub claim_to_docs: HashMap<String, Vec<TermPosting>>,
pub topic_to_docs: HashMap<String, Vec<String>>,
pub doc_type_to_docs: HashMap<String, Vec<String>>,
}
/// Compact per-chunk metadata; a chunk's numeric id is its position in
/// `MemoryIndex::chunks`. Persisted as part of `PersistedMemoryCore`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ChunkMeta {
/// Numeric id of the owning document (index into `doc_u32_to_id`).
doc_u32: u32,
chunk_id: String,
/// Line range of the chunk; used to build the per-document interval trees.
start_line: usize,
end_line: usize,
}
/// One node of a [`LexiconTrie`], stored in the trie's flat `nodes` arena.
#[derive(Debug, Default, Clone)]
struct TrieNode {
    /// Outgoing edges: next character -> index of the child in `LexiconTrie::nodes`.
    children: HashMap<char, usize>,
    /// Id stored for the key ending exactly at this node, if any.
    value: Option<u32>,
}

/// Character trie mapping lexicon keys to their numeric ids.
///
/// Nodes live in a flat vector; index 0 is the root. A trie made with
/// [`LexiconTrie::new`] always has a root, while a `Default` one starts with
/// an empty arena — every method below tolerates that state.
#[derive(Debug, Default, Clone)]
struct LexiconTrie {
    nodes: Vec<TrieNode>,
}

impl LexiconTrie {
    /// Creates an empty trie that already contains its root node.
    fn new() -> Self {
        Self {
            nodes: vec![TrieNode::default()],
        }
    }

    /// Associates `key` with `value`, overwriting any previous value for `key`.
    fn insert(&mut self, key: &str, value: u32) {
        // A `Default`-constructed trie has no root yet; create it on demand
        // instead of panicking on `nodes[0]`.
        if self.nodes.is_empty() {
            self.nodes.push(TrieNode::default());
        }
        let mut cur = 0usize;
        for ch in key.chars() {
            cur = match self.nodes[cur].children.get(&ch).copied() {
                Some(existing) => existing,
                None => {
                    let created = self.nodes.len();
                    self.nodes.push(TrieNode::default());
                    self.nodes[cur].children.insert(ch, created);
                    created
                }
            };
        }
        self.nodes[cur].value = Some(value);
    }

    /// Returns the id stored for exactly `key`, or `None` if it was never inserted.
    fn get(&self, key: &str) -> Option<u32> {
        let mut cur = 0usize;
        for ch in key.chars() {
            cur = *self.nodes.get(cur)?.children.get(&ch)?;
        }
        self.nodes.get(cur)?.value
    }

    /// Returns up to `limit` ids of keys that start with `prefix`.
    ///
    /// Ids come back in lexicographic (pre-order, by `char`) order of their
    /// keys, so a truncated result is stable across runs. `limit == 0` and an
    /// unknown prefix both yield an empty vector.
    fn prefix_ids(&self, prefix: &str, limit: usize) -> Vec<u32> {
        if limit == 0 {
            return Vec::new();
        }
        let mut cur = 0usize;
        for ch in prefix.chars() {
            let Some(&next) = self.nodes.get(cur).and_then(|n| n.children.get(&ch)) else {
                return Vec::new();
            };
            cur = next;
        }
        let mut out = Vec::new();
        let mut stack = vec![cur];
        while let Some(idx) = stack.pop() {
            // `get` keeps a root-less (default) trie from panicking.
            let Some(node) = self.nodes.get(idx) else {
                continue;
            };
            if let Some(v) = node.value {
                out.push(v);
                if out.len() >= limit {
                    break;
                }
            }
            // HashMap iteration order is arbitrary; push children in descending
            // character order so the smallest one is popped (visited) first.
            let mut children: Vec<(char, usize)> =
                node.children.iter().map(|(&c, &i)| (c, i)).collect();
            children.sort_unstable_by(|a, b| b.0.cmp(&a.0));
            stack.extend(children.into_iter().map(|(_, i)| i));
        }
        out
    }
}
/// A closed line range `[start, end]` tagged with the numeric id of its chunk.
#[derive(Debug, Clone)]
struct IntervalEntry {
    start: usize,
    end: usize,
    chunk_u32: u32,
}

/// Node of a centered interval tree.
#[derive(Debug, Clone)]
struct IntervalNode {
    /// Pivot: entries entirely below go left, entirely above go right.
    center: usize,
    /// Entries whose range contains `center`.
    overlaps: Vec<IntervalEntry>,
    left: Option<Box<IntervalNode>>,
    right: Option<Box<IntervalNode>>,
}

/// Centered interval tree over the chunk line ranges of one document.
#[derive(Debug, Clone, Default)]
struct IntervalTree {
    root: Option<Box<IntervalNode>>,
}

impl IntervalTree {
    /// Builds a tree from `entries`; an empty input gives a tree without a root.
    fn build(entries: Vec<IntervalEntry>) -> Self {
        /// Recursively splits `items` around the median of their midpoints.
        fn subtree(items: Vec<IntervalEntry>) -> Option<Box<IntervalNode>> {
            if items.is_empty() {
                return None;
            }
            let mut midpoints: Vec<usize> = items
                .iter()
                .map(|it| it.start + it.end.saturating_sub(it.start) / 2)
                .collect();
            midpoints.sort_unstable();
            let center = midpoints[midpoints.len() / 2];

            let mut below: Vec<IntervalEntry> = Vec::new();
            let mut above: Vec<IntervalEntry> = Vec::new();
            let mut spanning: Vec<IntervalEntry> = Vec::new();
            for item in items {
                let bucket = if item.end < center {
                    &mut below
                } else if item.start > center {
                    &mut above
                } else {
                    &mut spanning
                };
                bucket.push(item);
            }

            Some(Box::new(IntervalNode {
                center,
                overlaps: spanning,
                left: subtree(below),
                right: subtree(above),
            }))
        }

        IntervalTree {
            root: subtree(entries),
        }
    }

    /// Returns the chunk ids of all entries that intersect the closed range
    /// `[start, end]`, in node-then-left-then-right visiting order.
    fn query(&self, start: usize, end: usize) -> Vec<u32> {
        fn visit(node: Option<&IntervalNode>, start: usize, end: usize, hits: &mut Vec<u32>) {
            let Some(current) = node else {
                return;
            };
            hits.extend(
                current
                    .overlaps
                    .iter()
                    .filter(|e| e.start <= end && start <= e.end)
                    .map(|e| e.chunk_u32),
            );
            if start <= current.center {
                visit(current.left.as_deref(), start, end, hits);
            }
            if end >= current.center {
                visit(current.right.as_deref(), start, end, hits);
            }
        }

        let mut hits = Vec::new();
        visit(self.root.as_deref(), start, end, &mut hits);
        hits
    }
}
/// In-memory retrieval index over a set of [`DocRecord`]s.
///
/// Only the public maps are serialized; every field marked
/// `#[serde(skip_serializing)]` is runtime state rebuilt by the constructors
/// (`from_records*`, `load_with_binary_core`).
#[derive(Serialize)]
pub struct MemoryIndex {
/// All records keyed by `doc_id`.
pub docs: HashMap<String, DocRecord>,
/// Normalized entity key -> documents, sorted by descending score.
pub entity_to_docs: HashMap<String, Vec<EntityPosting>>,
/// Normalized term key -> documents, sorted by descending score.
pub term_to_docs: HashMap<String, Vec<TermPosting>>,
/// Claim token -> documents; left empty unless claim scoring is enabled.
pub claim_to_docs: HashMap<String, Vec<TermPosting>>,
/// Lower-cased topic -> doc ids.
pub topic_to_docs: HashMap<String, Vec<String>>,
/// Lower-cased document type -> doc ids.
pub doc_type_to_docs: HashMap<String, Vec<String>>,
/// Tantivy BM25 index; `None` when building it failed (a warning is printed).
#[serde(skip_serializing)]
lexical: Option<LexicalIndex>,
/// Doc id -> dense numeric id.
#[serde(skip_serializing)]
#[allow(dead_code)]
doc_id_to_u32: HashMap<String, u32>,
/// Dense numeric id -> doc id (inverse of `doc_id_to_u32`).
#[serde(skip_serializing)]
doc_u32_to_id: Vec<String>,
/// Chunk id -> dense numeric chunk id.
#[serde(skip_serializing)]
#[allow(dead_code)]
chunk_id_to_u32: HashMap<String, u32>,
/// Chunk metadata indexed by numeric chunk id.
#[serde(skip_serializing)]
chunks: Vec<ChunkMeta>,
/// Numeric doc id -> numeric chunk ids of that document.
#[serde(skip_serializing)]
#[allow(dead_code)]
doc_to_chunks: Vec<Vec<u32>>,
/// Term key -> numeric term id (index into `term_postings_chunk`).
#[serde(skip_serializing)]
#[allow(dead_code)]
term_lexicon: HashMap<String, u32>,
/// Entity key -> numeric entity id (index into `entity_postings_chunk`).
#[serde(skip_serializing)]
#[allow(dead_code)]
entity_lexicon: HashMap<String, u32>,
/// Per term id: `(chunk id, score)` postings.
#[serde(skip_serializing)]
term_postings_chunk: Vec<Vec<(u32, f32)>>,
/// Per entity id: `(chunk id, score)` postings.
#[serde(skip_serializing)]
entity_postings_chunk: Vec<Vec<(u32, f32)>>,
/// Per chunk id: term ids occurring in the chunk (may contain repeats).
#[serde(skip_serializing)]
#[allow(dead_code)]
chunk_terms: Vec<Vec<u32>>,
/// Per chunk id: entity ids occurring in the chunk (may contain repeats).
#[serde(skip_serializing)]
#[allow(dead_code)]
chunk_entities: Vec<Vec<u32>>,
/// Trie over `term_lexicon` keys.
#[serde(skip_serializing)]
term_trie: LexiconTrie,
/// Trie over `entity_lexicon` keys.
#[serde(skip_serializing)]
entity_trie: LexiconTrie,
/// Per numeric doc id: interval tree over the line ranges of its chunks.
#[serde(skip_serializing)]
doc_interval_trees: Vec<IntervalTree>,
/// Per numeric doc id: output of `normalized_entity_keys` for the record's entities.
#[serde(skip_serializing)]
doc_key_entities: Vec<Vec<String>>,
/// Whether claim postings are built/kept.
#[serde(skip_serializing)]
claim_scoring: bool,
/// NOTE(review): presumably toggles the n-gram text re-rank — read outside this view.
#[serde(skip_serializing)]
text_rerank_ngram: bool,
/// NOTE(review): presumably toggles the LCS text re-rank — read outside this view.
#[serde(skip_serializing)]
text_rerank_lcs: bool,
}
/// Handle to the tantivy index used for BM25 search, together with a reader
/// and the resolved field handles of its schema.
struct LexicalIndex {
index: Index,
reader: IndexReader,
/// Stored, untokenized document id.
doc_id_f: Field,
/// Concatenated chunk contents.
content_f: Field,
/// Concatenated chunk headings.
headings_f: Field,
/// Concatenated chunk important terms.
terms_f: Field,
/// Concatenated chunk key entities.
entities_f: Field,
/// The document's temporal terms.
temporal_f: Field,
}
/// Per-component contributions to a result's score; every component defaults to 0.
///
/// In the part of the file reviewed here only `recency_score` is written: the
/// temporal boosts in `query_with_temporal_context` are added to it.
#[derive(Debug, Clone, Serialize, Default)]
pub struct ScoreBreakdown {
pub lexical_score: f32,
pub entity_score: f32,
pub term_score: f32,
pub claim_score: f32,
pub topic_score: f32,
pub doc_type_score: f32,
/// Sum of the temporal range and proximity boosts applied to the result.
pub recency_score: f32,
pub graph_link_score: f32,
pub entity_graph_score: f32,
pub sequence_rerank_score: f32,
}
/// One ranked hit returned by the `query*` methods of `MemoryIndex`.
#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
/// Id of the matching document (key into `MemoryIndex::docs`).
pub doc_id: String,
pub source: String,
pub group_id: Option<String>,
/// Final score; results are returned in descending order of this value.
pub score: f32,
/// How `score` is composed.
pub score_breakdown: ScoreBreakdown,
pub matched_entities: Vec<String>,
pub matched_terms: Vec<String>,
pub probable_topic: Option<String>,
pub doc_type_guess: Option<String>,
}
/// Wall-clock timings of a query, in milliseconds (values are computed as
/// `elapsed().as_secs_f64() * 1000.0`).
///
/// NOTE(review): only `total_ms`, `lexical_bm25_ms` and `snapshot_query_ms`
/// are measured in the part of the file reviewed here; the remaining stages
/// are presumably filled in by the scoring code — confirm.
#[derive(Debug, Clone, Default, Serialize)]
pub struct QueryTimings {
pub total_ms: f64,
pub refresh_ms: f64,
pub lexical_bm25_ms: f64,
pub snapshot_query_ms: f64,
pub rerank_ms: f64,
pub parse_ms: f64,
pub sparse_scoring_ms: f64,
pub candidate_accumulation_ms: f64,
pub candidate_rank_ms: f64,
pub metadata_ms: f64,
pub graph_ms: f64,
pub entity_graph_ms: f64,
pub sequence_rerank_ms: f64,
pub ranking_ms: f64,
}
/// Counters describing a query run, returned alongside results and timings.
///
/// NOTE(review): populated outside the part of the file reviewed here; the
/// field meanings below are read off the names only — confirm.
#[derive(Debug, Clone, Default, Serialize)]
pub struct QueryDiagnostics {
pub query_terms: usize,
pub expanded_terms: usize,
pub lexical_hits: usize,
pub candidates: usize,
}
/// Temporal constraints for `MemoryIndex::query_with_temporal_context`.
#[derive(Debug, Clone)]
pub struct TemporalQueryContext<'a> {
    /// Optional lower bound of the explicit date range, as a date string.
    pub starts_from: Option<&'a str>,
    /// Optional upper bound of the explicit date range, as a date string.
    pub ends_at: Option<&'a str>,
    /// Width, in days, of the proximity window around a resolved target date
    /// (raised to at least 1 where it is used).
    pub window_days: i64,
    /// When `true`, results that are undated or fall outside the bounds/window
    /// are dropped instead of merely not boosted.
    pub hard_filter: bool,
}

impl Default for TemporalQueryContext<'_> {
    /// No explicit bounds, a 7-day proximity window, soft (boost-only) filtering.
    fn default() -> Self {
        const DEFAULT_WINDOW_DAYS: i64 = 7;
        TemporalQueryContext {
            window_days: DEFAULT_WINDOW_DAYS,
            hard_filter: false,
            starts_from: None,
            ends_at: None,
        }
    }
}
/// Running score, score breakdown and matched entity/term lists for one
/// candidate.
///
/// NOTE(review): not used in the part of the file reviewed here; presumably
/// the per-document accumulator of the scoring pass — confirm.
#[derive(Debug, Clone, Default)]
struct CandidateState {
score: f32,
breakdown: ScoreBreakdown,
matched_entities: Vec<String>,
matched_terms: Vec<String>,
}
/// Serializable subset of [`DocRecord`]: identity, size, grouping, topic/type
/// labels and provenance. It has no field for content, entities, terms,
/// chunks, claims or embeddings.
#[derive(Debug, Clone, Serialize)]
pub struct RedactedDocRecord {
pub doc_id: String,
pub source: String,
pub timestamp: Option<String>,
pub doc_length: usize,
pub group_id: Option<String>,
pub probable_topic: Option<String>,
pub doc_type_guess: Option<String>,
pub provenance: Provenance,
}
/// Serializable view of an index that carries only [`RedactedDocRecord`]s and
/// the topic / doc-type maps (no entity, term or claim postings).
#[derive(Debug, Clone, Serialize)]
pub struct RedactedMemoryIndex {
/// Redacted records keyed by doc id.
pub docs: HashMap<String, RedactedDocRecord>,
pub topic_to_docs: HashMap<String, Vec<String>>,
pub doc_type_to_docs: HashMap<String, Vec<String>>,
}
/// The sparse tables of a `MemoryIndex` in the form written by
/// `save_binary_core` and read back by `load_with_binary_core` (bincode).
///
/// Document records, the lexical index, tries and interval trees are not part
/// of it; the loader rebuilds them.
///
/// NOTE(review): bincode is a positional format, so reordering, adding or
/// removing fields presumably invalidates previously written files — confirm
/// before changing this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PersistedMemoryCore {
entity_to_docs: HashMap<String, Vec<EntityPosting>>,
term_to_docs: HashMap<String, Vec<TermPosting>>,
#[serde(default)]
claim_to_docs: HashMap<String, Vec<TermPosting>>,
topic_to_docs: HashMap<String, Vec<String>>,
doc_type_to_docs: HashMap<String, Vec<String>>,
// Dense id tables (see the fields of the same name on `MemoryIndex`).
doc_id_to_u32: HashMap<String, u32>,
doc_u32_to_id: Vec<String>,
chunk_id_to_u32: HashMap<String, u32>,
chunks: Vec<ChunkMeta>,
doc_to_chunks: Vec<Vec<u32>>,
// Lexicons and chunk-level postings.
term_lexicon: HashMap<String, u32>,
entity_lexicon: HashMap<String, u32>,
term_postings_chunk: Vec<Vec<(u32, f32)>>,
entity_postings_chunk: Vec<Vec<(u32, f32)>>,
chunk_terms: Vec<Vec<u32>>,
chunk_entities: Vec<Vec<u32>>,
}
impl MemoryIndex {
pub fn from_records(records: Vec<DocRecord>) -> Self {
Self::from_records_with_lexical_dir(records, None, false, false, false)
}
pub fn from_records_with_lexical_dir(
records: Vec<DocRecord>,
lexical_dir: Option<&Path>,
text_rerank_ngram: bool,
text_rerank_lcs: bool,
claim_scoring: bool,
) -> Self {
Self::from_records_internal(
records,
lexical_dir,
None,
text_rerank_ngram,
text_rerank_lcs,
claim_scoring,
)
}
pub(crate) fn from_records_with_semantic_aggregate(
records: Vec<DocRecord>,
semantic_aggregate: SemanticAggregate,
text_rerank_ngram: bool,
text_rerank_lcs: bool,
claim_scoring: bool,
) -> Self {
Self::from_records_internal(
records,
None,
Some(semantic_aggregate),
text_rerank_ngram,
text_rerank_lcs,
claim_scoring,
)
}
// Shared constructor behind every `from_records*` entry point.
//
// Assigns dense numeric ids to documents and chunks, builds the chunk-level
// and document-level posting tables (either derived from `records` or adopted
// from `semantic_aggregate`), sorts all document-level posting lists by
// descending score, builds the tries / interval trees and finally the lexical
// index. A failure to build the lexical index is reported on stderr and leaves
// `lexical` as `None`; it never fails the constructor.
fn from_records_internal(
records: Vec<DocRecord>,
lexical_dir: Option<&Path>,
semantic_aggregate: Option<SemanticAggregate>,
text_rerank_ngram: bool,
text_rerank_lcs: bool,
claim_scoring: bool,
) -> Self {
let mut docs = HashMap::new();
let mut entity_to_docs: HashMap<String, Vec<EntityPosting>> = HashMap::new();
let mut term_to_docs: HashMap<String, Vec<TermPosting>> = HashMap::new();
let mut claim_to_docs: HashMap<String, Vec<TermPosting>> = HashMap::new();
let mut topic_to_docs: HashMap<String, Vec<String>> = HashMap::new();
let mut doc_type_to_docs: HashMap<String, Vec<String>> = HashMap::new();
let mut doc_id_to_u32: HashMap<String, u32> = HashMap::new();
let mut doc_u32_to_id: Vec<String> = Vec::new();
let mut chunk_id_to_u32: HashMap<String, u32> = HashMap::new();
let mut chunks: Vec<ChunkMeta> = Vec::new();
let mut doc_to_chunks: Vec<Vec<u32>> = Vec::new();
let mut term_lexicon: HashMap<String, u32> = HashMap::new();
let mut entity_lexicon: HashMap<String, u32> = HashMap::new();
let mut term_postings_chunk: Vec<Vec<(u32, f32)>> = Vec::new();
let mut entity_postings_chunk: Vec<Vec<(u32, f32)>> = Vec::new();
let mut chunk_terms: Vec<Vec<u32>> = Vec::new();
let mut chunk_entities: Vec<Vec<u32>> = Vec::new();
let mut doc_key_entities: Vec<Vec<String>> = Vec::new();
// Pass 1: one iteration per record, in input order. The numeric doc id is the
// record's position in that order.
for mut record in records {
let doc_id = record.doc_id.clone();
let doc_u32 = doc_u32_to_id.len() as u32;
doc_id_to_u32.insert(doc_id.clone(), doc_u32);
doc_u32_to_id.push(doc_id.clone());
doc_to_chunks.push(Vec::new());
doc_key_entities.push(normalized_entity_keys(&record.key_entities));
// A record without chunks gets one synthetic chunk covering the whole
// document, named after its first heading (or "(document)").
if record.section_chunks.is_empty() {
record.section_chunks.push(SectionChunk {
chunk_id: stable_chunk_id(
&doc_id,
record
.headings
.first()
.map(String::as_str)
.unwrap_or("(document)"),
&record.content,
1,
record.content.lines().count().max(1),
),
heading: record
.headings
.first()
.cloned()
.unwrap_or_else(|| "(document)".to_string()),
content: record.content.clone(),
start_line: 1,
end_line: record.content.lines().count().max(1),
timestamp: record.timestamp.clone(),
key_entities: record
.key_entities
.iter()
.map(|e| normalize_for_index(&e.text))
.filter(|v| !v.is_empty())
.collect(),
important_terms: record
.important_terms
.iter()
.map(|t| normalize_for_index(&t.term))
.filter(|v| !v.is_empty())
.collect(),
});
}
// Without an aggregate, chunk tables and chunk-level postings are derived
// from the record's chunks. Fixed posting scores: important term 0.8,
// heading token 0.4, entity 0.9.
if semantic_aggregate.is_none() {
for chunk in &record.section_chunks {
let chunk_u32 = chunks.len() as u32;
chunk_id_to_u32.insert(chunk.chunk_id.clone(), chunk_u32);
chunks.push(ChunkMeta {
doc_u32,
chunk_id: chunk.chunk_id.clone(),
start_line: chunk.start_line,
end_line: chunk.end_line,
});
doc_to_chunks[doc_u32 as usize].push(chunk_u32);
chunk_terms.push(Vec::new());
chunk_entities.push(Vec::new());
for term in &chunk.important_terms {
let key = normalize_for_index(term);
if key.is_empty() {
continue;
}
// A new lexicon entry allocates its (empty) posting list; the term id
// is that list's index.
let term_u32 = *term_lexicon.entry(key).or_insert_with(|| {
term_postings_chunk.push(Vec::new());
(term_postings_chunk.len() - 1) as u32
});
term_postings_chunk[term_u32 as usize].push((chunk_u32, 0.8));
chunk_terms[chunk_u32 as usize].push(term_u32);
}
// Heading tokens share the term lexicon, so a word that is both an
// important term and a heading token yields two postings for the chunk.
let heading_tokens = tokenize_query_terms(&chunk.heading);
for token in heading_tokens {
let term_u32 = *term_lexicon.entry(token).or_insert_with(|| {
term_postings_chunk.push(Vec::new());
(term_postings_chunk.len() - 1) as u32
});
term_postings_chunk[term_u32 as usize].push((chunk_u32, 0.4));
chunk_terms[chunk_u32 as usize].push(term_u32);
}
for entity in &chunk.key_entities {
let key = normalize_for_index(entity);
if key.is_empty() {
continue;
}
let entity_u32 = *entity_lexicon.entry(key).or_insert_with(|| {
entity_postings_chunk.push(Vec::new());
(entity_postings_chunk.len() - 1) as u32
});
entity_postings_chunk[entity_u32 as usize].push((chunk_u32, 0.9));
chunk_entities[chunk_u32 as usize].push(entity_u32);
}
}
}
if semantic_aggregate.is_none() {
// NOTE(review): the entity postings collected here are discarded further
// down — on this same (no-aggregate) path `entity_to_docs` is cleared and
// rebuilt from the chunk-level postings. Only the topic and doc-type maps
// filled below survive.
for entity in &record.key_entities {
let key = normalize_for_index(&entity.text);
if key.is_empty() {
continue;
}
let score = entity.score.unwrap_or(0.5);
entity_to_docs.entry(key).or_default().push(EntityPosting {
doc_id: doc_id.clone(),
score,
});
}
if let Some(topic) = record.probable_topic.as_ref() {
topic_to_docs
.entry(topic.to_lowercase())
.or_default()
.push(doc_id.clone());
}
if let Some(doc_type) = record.doc_type_guess.as_ref() {
doc_type_to_docs
.entry(doc_type.to_lowercase())
.or_default()
.push(doc_id.clone());
}
}
// Claim tokens are indexed from the records on both paths; with an
// aggregate present the result is replaced by the aggregate's map below.
if claim_scoring {
for claim in &record.top_claims {
for token in claim_tokens(claim) {
claim_to_docs
.entry(token)
.or_default()
.push(TermPosting {
doc_id: doc_id.clone(),
score: claim.confidence.max(0.1),
});
}
}
}
docs.insert(doc_id, record);
}
// Reverse lexicons (numeric id -> key). They are only read on the
// no-aggregate path; with an aggregate both lexicons are still empty here.
let mut term_u32_to_key: Vec<String> = vec![String::new(); term_lexicon.len()];
for (k, &v) in &term_lexicon {
term_u32_to_key[v as usize] = k.clone();
}
let mut entity_u32_to_key: Vec<String> = vec![String::new(); entity_lexicon.len()];
for (k, &v) in &entity_lexicon {
entity_u32_to_key[v as usize] = k.clone();
}
if let Some(semantic_aggregate) = semantic_aggregate {
// Aggregate path: chunk ids come from the aggregate, numbered per document
// in sorted chunk-id order.
for (doc_u32, doc_id) in doc_u32_to_id.iter().enumerate() {
let mut doc_chunk_ids = semantic_aggregate
.doc_to_chunks
.get(doc_id)
.cloned()
.unwrap_or_default();
doc_chunk_ids.sort();
for chunk_id in doc_chunk_ids {
// Chunks without a known line range are not registered at all.
let Some((start_line, end_line)) =
semantic_aggregate.chunk_ranges.get(&chunk_id).copied()
else {
continue;
};
let chunk_u32 = chunks.len() as u32;
chunk_id_to_u32.insert(chunk_id.clone(), chunk_u32);
chunks.push(ChunkMeta {
doc_u32: doc_u32 as u32,
chunk_id: chunk_id.clone(),
start_line,
end_line,
});
doc_to_chunks[doc_u32].push(chunk_u32);
chunk_terms.push(Vec::new());
chunk_entities.push(Vec::new());
}
}
// Postings that reference an unregistered chunk id are dropped; the
// lexicon entry for the key is created regardless.
for (key, postings) in &semantic_aggregate.term_to_chunks {
let term_u32 = *term_lexicon.entry(key.clone()).or_insert_with(|| {
term_postings_chunk.push(Vec::new());
(term_postings_chunk.len() - 1) as u32
});
for (chunk_id, score) in postings {
let Some(&chunk_u32) = chunk_id_to_u32.get(chunk_id) else {
continue;
};
term_postings_chunk[term_u32 as usize].push((chunk_u32, *score));
chunk_terms[chunk_u32 as usize].push(term_u32);
}
}
for (key, postings) in &semantic_aggregate.entity_to_chunks {
let entity_u32 = *entity_lexicon.entry(key.clone()).or_insert_with(|| {
entity_postings_chunk.push(Vec::new());
(entity_postings_chunk.len() - 1) as u32
});
for (chunk_id, score) in postings {
let Some(&chunk_u32) = chunk_id_to_u32.get(chunk_id) else {
continue;
};
entity_postings_chunk[entity_u32 as usize].push((chunk_u32, *score));
chunk_entities[chunk_u32 as usize].push(entity_u32);
}
}
// Document-level maps are taken over from the aggregate wholesale.
entity_to_docs = semantic_aggregate.entity_to_docs;
term_to_docs = semantic_aggregate.term_to_docs;
if claim_scoring {
claim_to_docs = semantic_aggregate.claim_to_docs;
}
topic_to_docs = semantic_aggregate.topic_to_docs;
doc_type_to_docs = semantic_aggregate.doc_type_to_docs;
} else {
// No-aggregate path: document-level term/entity postings are rebuilt by
// projecting every chunk-level posting onto the chunk's document, so a
// document appears once per matching chunk posting.
term_to_docs.clear();
entity_to_docs.clear();
if !claim_scoring {
claim_to_docs.clear();
}
for (term_u32, postings) in term_postings_chunk.iter().enumerate() {
let key = &term_u32_to_key[term_u32];
for (chunk_u32, score) in postings {
let doc_u32 = chunks[*chunk_u32 as usize].doc_u32;
let doc_id = &doc_u32_to_id[doc_u32 as usize];
term_to_docs
.entry(key.clone())
.or_default()
.push(TermPosting {
doc_id: doc_id.clone(),
score: *score,
});
}
}
for (entity_u32, postings) in entity_postings_chunk.iter().enumerate() {
let key = &entity_u32_to_key[entity_u32];
for (chunk_u32, score) in postings {
let doc_u32 = chunks[*chunk_u32 as usize].doc_u32;
let doc_id = &doc_u32_to_id[doc_u32 as usize];
entity_to_docs
.entry(key.clone())
.or_default()
.push(EntityPosting {
doc_id: doc_id.clone(),
score: *score,
});
}
}
}
// All document-level posting lists end up sorted by descending score;
// incomparable scores (NaN) are treated as equal.
if claim_scoring {
for postings in claim_to_docs.values_mut() {
postings.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
}
}
let (term_trie, entity_trie, doc_interval_trees) = Self::build_runtime_helpers(
&term_lexicon,
&entity_lexicon,
&chunks,
doc_u32_to_id.len(),
);
for postings in entity_to_docs.values_mut() {
postings.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
}
for postings in term_to_docs.values_mut() {
postings.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
}
// Best effort: the index stays usable without BM25 if tantivy fails.
let lexical = match Self::build_lexical_index(&docs, lexical_dir) {
Ok(index) => Some(index),
Err(err) => {
eprintln!("warning: lexical BM25 index disabled: {}", err);
None
}
};
Self {
docs,
entity_to_docs,
term_to_docs,
claim_to_docs,
topic_to_docs,
doc_type_to_docs,
lexical,
doc_id_to_u32,
doc_u32_to_id,
chunk_id_to_u32,
chunks,
doc_to_chunks,
term_lexicon,
entity_lexicon,
term_postings_chunk,
entity_postings_chunk,
chunk_terms,
chunk_entities,
term_trie,
entity_trie,
doc_interval_trees,
doc_key_entities,
claim_scoring,
text_rerank_ngram,
text_rerank_lcs,
}
}
/// Builds the lookup structures that are never persisted: one trie per
/// lexicon and one interval tree per document.
///
/// A chunk contributes to its document's tree only if its line range is
/// well-formed (`end_line > 0` and `start_line <= end_line`) and its
/// `doc_u32` is below `doc_count`. A `start_line` of 0 is raised to 1.
fn build_runtime_helpers(
    term_lexicon: &HashMap<String, u32>,
    entity_lexicon: &HashMap<String, u32>,
    chunks: &[ChunkMeta],
    doc_count: usize,
) -> (LexiconTrie, LexiconTrie, Vec<IntervalTree>) {
    let trie_from = |lexicon: &HashMap<String, u32>| -> LexiconTrie {
        let mut trie = LexiconTrie::new();
        lexicon.iter().for_each(|(key, &id)| trie.insert(key, id));
        trie
    };
    let term_trie = trie_from(term_lexicon);
    let entity_trie = trie_from(entity_lexicon);

    let mut per_doc: Vec<Vec<IntervalEntry>> = vec![Vec::new(); doc_count];
    for (position, meta) in chunks.iter().enumerate() {
        let span_is_valid = meta.end_line > 0 && meta.start_line <= meta.end_line;
        if !span_is_valid {
            continue;
        }
        // `per_doc` has exactly `doc_count` slots, so an out-of-range owner
        // simply has no bucket.
        let Some(bucket) = per_doc.get_mut(meta.doc_u32 as usize) else {
            continue;
        };
        bucket.push(IntervalEntry {
            start: meta.start_line.max(1),
            end: meta.end_line,
            chunk_u32: position as u32,
        });
    }

    let doc_trees: Vec<IntervalTree> = per_doc.into_iter().map(IntervalTree::build).collect();
    (term_trie, entity_trie, doc_trees)
}
pub fn load_with_binary_core(
records: Vec<DocRecord>,
core_path: &Path,
lexical_dir: Option<&Path>,
claim_scoring: bool,
) -> Result<Self> {
let bytes = fs::read(core_path)?;
let core: PersistedMemoryCore = bincode::deserialize(&bytes)?;
if core.doc_u32_to_id.is_empty() && !records.is_empty() {
anyhow::bail!("invalid binary core: empty doc table");
}
let mut docs = HashMap::new();
for mut record in records {
if record.section_chunks.is_empty() {
record.section_chunks.push(SectionChunk {
chunk_id: stable_chunk_id(
&record.doc_id,
record
.headings
.first()
.map(String::as_str)
.unwrap_or("(document)"),
&record.content,
1,
record.content.lines().count().max(1),
),
heading: record
.headings
.first()
.cloned()
.unwrap_or_else(|| "(document)".to_string()),
content: record.content.clone(),
start_line: 1,
end_line: record.content.lines().count().max(1),
timestamp: record.timestamp.clone(),
key_entities: record
.key_entities
.iter()
.map(|e| normalize_for_index(&e.text))
.filter(|v| !v.is_empty())
.collect(),
important_terms: record
.important_terms
.iter()
.map(|t| normalize_for_index(&t.term))
.filter(|v| !v.is_empty())
.collect(),
});
}
docs.insert(record.doc_id.clone(), record);
}
let lexical = match Self::build_lexical_index(&docs, lexical_dir) {
Ok(index) => Some(index),
Err(err) => {
eprintln!("warning: lexical BM25 index disabled: {}", err);
None
}
};
let (term_trie, entity_trie, doc_interval_trees) = Self::build_runtime_helpers(
&core.term_lexicon,
&core.entity_lexicon,
&core.chunks,
core.doc_u32_to_id.len(),
);
let mut doc_key_entities = vec![Vec::new(); core.doc_u32_to_id.len()];
for (doc_id, record) in &docs {
if let Some(&doc_u32) = core.doc_id_to_u32.get(doc_id) {
doc_key_entities[doc_u32 as usize] = normalized_entity_keys(&record.key_entities);
}
}
Ok(Self {
docs,
entity_to_docs: core.entity_to_docs,
term_to_docs: core.term_to_docs,
claim_to_docs: if claim_scoring {
core.claim_to_docs
} else {
HashMap::new()
},
topic_to_docs: core.topic_to_docs,
doc_type_to_docs: core.doc_type_to_docs,
lexical,
doc_id_to_u32: core.doc_id_to_u32,
doc_u32_to_id: core.doc_u32_to_id,
chunk_id_to_u32: core.chunk_id_to_u32,
chunks: core.chunks,
doc_to_chunks: core.doc_to_chunks,
term_lexicon: core.term_lexicon,
entity_lexicon: core.entity_lexicon,
term_postings_chunk: core.term_postings_chunk,
entity_postings_chunk: core.entity_postings_chunk,
chunk_terms: core.chunk_terms,
chunk_entities: core.chunk_entities,
term_trie,
entity_trie,
doc_interval_trees,
doc_key_entities,
claim_scoring,
text_rerank_ngram: false,
text_rerank_lcs: false,
})
}
/// Writes the sparse tables of this index to `core_path` in bincode form,
/// creating the parent directory first if the path has one.
///
/// # Errors
///
/// Returns the underlying error if the directory cannot be created, the
/// tables cannot be encoded, or the file cannot be written.
pub fn save_binary_core(&self, core_path: &Path) -> Result<()> {
    core_path.parent().map(fs::create_dir_all).transpose()?;

    // Snapshot of exactly the tables `load_with_binary_core` reads back.
    let snapshot = PersistedMemoryCore {
        entity_to_docs: self.entity_to_docs.clone(),
        term_to_docs: self.term_to_docs.clone(),
        claim_to_docs: self.claim_to_docs.clone(),
        topic_to_docs: self.topic_to_docs.clone(),
        doc_type_to_docs: self.doc_type_to_docs.clone(),
        doc_id_to_u32: self.doc_id_to_u32.clone(),
        doc_u32_to_id: self.doc_u32_to_id.clone(),
        chunk_id_to_u32: self.chunk_id_to_u32.clone(),
        chunks: self.chunks.clone(),
        doc_to_chunks: self.doc_to_chunks.clone(),
        term_lexicon: self.term_lexicon.clone(),
        entity_lexicon: self.entity_lexicon.clone(),
        term_postings_chunk: self.term_postings_chunk.clone(),
        entity_postings_chunk: self.entity_postings_chunk.clone(),
        chunk_terms: self.chunk_terms.clone(),
        chunk_entities: self.chunk_entities.clone(),
    };

    let encoded = bincode::serialize(&snapshot)?;
    fs::write(core_path, encoded)?;
    Ok(())
}
fn build_lexical_index(
docs: &HashMap<String, DocRecord>,
lexical_dir: Option<&Path>,
) -> Result<LexicalIndex> {
let mut schema_builder = Schema::builder();
let doc_id_f = schema_builder.add_text_field("doc_id", STRING | STORED);
let content_f = schema_builder.add_text_field("content", TEXT);
let headings_f = schema_builder.add_text_field("headings", TEXT);
let terms_f = schema_builder.add_text_field("important_terms", TEXT);
let entities_f = schema_builder.add_text_field("entities", TEXT);
let temporal_f = schema_builder.add_text_field("temporal_terms", TEXT);
let schema = schema_builder.build();
let index = if let Some(dir) = lexical_dir {
fs::create_dir_all(dir)?;
match Index::open_in_dir(dir) {
Ok(existing) => existing,
Err(_) => {
if dir.exists() {
let _ = fs::remove_dir_all(dir);
fs::create_dir_all(dir)?;
}
let created = Index::create_in_dir(dir, schema.clone())?;
let mut writer = created.writer(50_000_000)?;
for doc in docs.values() {
let headings_text = doc
.section_chunks
.iter()
.map(|c| c.heading.as_str())
.collect::<Vec<_>>()
.join(" ");
let terms_text = doc
.section_chunks
.iter()
.flat_map(|c| c.important_terms.iter().map(String::as_str))
.collect::<Vec<_>>()
.join(" ");
let entities_text = doc
.section_chunks
.iter()
.flat_map(|c| c.key_entities.iter().map(String::as_str))
.collect::<Vec<_>>()
.join(" ");
let content_text = doc
.section_chunks
.iter()
.map(|c| c.content.as_str())
.collect::<Vec<_>>()
.join("\n");
let temporal_text = doc.temporal_terms.join(" ");
writer.add_document(doc!(
doc_id_f => doc.doc_id.clone(),
content_f => content_text,
headings_f => headings_text,
terms_f => terms_text,
entities_f => entities_text,
temporal_f => temporal_text
))?;
}
writer.commit()?;
created
}
}
} else {
let ram = Index::create_in_ram(schema);
let mut writer = ram.writer(50_000_000)?;
for doc in docs.values() {
let headings_text = doc
.section_chunks
.iter()
.map(|c| c.heading.as_str())
.collect::<Vec<_>>()
.join(" ");
let terms_text = doc
.section_chunks
.iter()
.flat_map(|c| c.important_terms.iter().map(String::as_str))
.collect::<Vec<_>>()
.join(" ");
let entities_text = doc
.section_chunks
.iter()
.flat_map(|c| c.key_entities.iter().map(String::as_str))
.collect::<Vec<_>>()
.join(" ");
let content_text = doc
.section_chunks
.iter()
.map(|c| c.content.as_str())
.collect::<Vec<_>>()
.join("\n");
let temporal_text = doc.temporal_terms.join(" ");
writer.add_document(doc!(
doc_id_f => doc.doc_id.clone(),
content_f => content_text,
headings_f => headings_text,
terms_f => terms_text,
entities_f => entities_text,
temporal_f => temporal_text
))?;
}
writer.commit()?;
ram
};
let reader = index.reader()?;
let schema_ref = index.schema();
let doc_id_f = schema_ref.get_field("doc_id")?;
let content_f = schema_ref.get_field("content")?;
let headings_f = schema_ref.get_field("headings")?;
let terms_f = schema_ref.get_field("important_terms")?;
let entities_f = schema_ref.get_field("entities")?;
let temporal_f = schema_ref.get_field("temporal_terms")?;
Ok(LexicalIndex {
index,
reader,
doc_id_f,
content_f,
headings_f,
terms_f,
entities_f,
temporal_f,
})
}
/// Runs `query` against the lexical (BM25) index and returns up to `top_k`
/// hits as `doc_id -> score`.
///
/// Returns an empty map when no lexical index is available or when `top_k`
/// is 0. The query is parsed over the content, headings, important-terms,
/// entities and temporal-terms fields with per-field boosts. If the raw query
/// does not parse, one retry is made with `sanitize_bm25_query(query)`.
///
/// # Errors
///
/// Returns the *original* parse error when both parse attempts fail (or the
/// sanitized query is empty), and any tantivy error raised while searching or
/// loading a hit.
fn lexical_bm25(&self, query: &str, top_k: usize) -> Result<HashMap<String, f32>> {
    // Boost for the temporal-terms field, named like its sibling field boosts.
    const LEXICAL_TEMPORAL_BOOST: f32 = 1.1;

    let Some(lex) = self.lexical.as_ref() else {
        return Ok(HashMap::new());
    };
    // `TopDocs::with_limit` panics on a zero limit; nothing was asked for, so
    // nothing is returned.
    if top_k == 0 {
        return Ok(HashMap::new());
    }
    let searcher = lex.reader.searcher();
    let mut query_parser = QueryParser::for_index(
        &lex.index,
        vec![
            lex.content_f,
            lex.headings_f,
            lex.terms_f,
            lex.entities_f,
            lex.temporal_f,
        ],
    );
    query_parser.set_field_boost(lex.content_f, LEXICAL_CONTENT_BOOST);
    query_parser.set_field_boost(lex.headings_f, LEXICAL_HEADINGS_BOOST);
    query_parser.set_field_boost(lex.terms_f, LEXICAL_TERMS_BOOST);
    query_parser.set_field_boost(lex.entities_f, LEXICAL_ENTITIES_BOOST);
    query_parser.set_field_boost(lex.temporal_f, LEXICAL_TEMPORAL_BOOST);
    let parsed = match query_parser.parse_query(query) {
        Ok(parsed) => parsed,
        Err(first_err) => {
            // Free-form user text may contain query-syntax characters; retry
            // once with a sanitized form and report the first error otherwise.
            let fallback_query = sanitize_bm25_query(query);
            if fallback_query.is_empty() {
                return Err(first_err.into());
            }
            match query_parser.parse_query(&fallback_query) {
                Ok(parsed) => parsed,
                Err(_) => return Err(first_err.into()),
            }
        }
    };
    let top_docs = searcher.search(&parsed, &TopDocs::with_limit(top_k))?;
    let mut out = HashMap::new();
    for (score, addr) in top_docs {
        let retrieved: TantivyDocument = searcher.doc(addr)?;
        // Hits without a readable `doc_id` field are skipped.
        if let Some(v) = retrieved.get_first(lex.doc_id_f) {
            if let Some(doc_id) = v.as_str() {
                out.insert(doc_id.to_string(), score);
            }
        }
    }
    Ok(out)
}
/// Returns the top `top_k` results for `query` using the default temporal
/// context, discarding timings and diagnostics.
pub fn query(&self, query: &str, top_k: usize) -> Vec<SearchResult> {
    let (results, _timings, _diagnostics) =
        self.query_with_temporal_context(query, top_k, TemporalQueryContext::default());
    results
}
/// Runs `query` and re-scores the hits against temporal constraints.
///
/// An over-fetched candidate list (`max(20, 5 * top_k)`) is obtained from
/// `query_timed`. If neither explicit bounds nor a target date resolved from
/// the query text apply, that list is simply truncated to `top_k`. Otherwise
/// each candidate with a known date:
///
/// * receives `TEMPORAL_RANGE_BOOST` when it lies within the explicit bounds
///   (a missing bound counts as satisfied);
/// * receives up to `TEMPORAL_PROXIMITY_WEIGHT`, scaled linearly by how close
///   it is to the target date within `window_days` (at least 1).
///
/// With `hard_filter`, candidates that are undated, outside the bounds or
/// outside the window are dropped; without it they are kept unboosted.
/// Candidates whose document is no longer in `docs` are always dropped.
/// Both boosts are also recorded in `score_breakdown.recency_score`.
pub fn query_with_temporal_context(
    &self,
    query: &str,
    top_k: usize,
    temporal: TemporalQueryContext<'_>,
) -> (Vec<SearchResult>, QueryTimings, QueryDiagnostics) {
    let search_k = top_k.saturating_mul(5).max(20);
    let total_start = Instant::now();
    let (mut results, mut timings, diagnostics) = self.query_timed(query, search_k);

    let (temporal_start, temporal_end) =
        normalize_temporal_bounds(temporal.starts_from, temporal.ends_at);
    // Anchor for relative expressions: prefer a bound that normalized
    // successfully, then fall back to whatever raw bound was supplied.
    let relative_anchor = temporal_start
        .and_then(|_| temporal.starts_from)
        .or_else(|| temporal_end.and_then(|_| temporal.ends_at))
        .or(temporal.starts_from)
        .or(temporal.ends_at);
    let target = resolve_temporal_target(query, relative_anchor);

    let has_explicit_bounds = temporal_start.is_some() || temporal_end.is_some();
    if target.is_none() && !has_explicit_bounds {
        timings.total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
        results.truncate(top_k);
        return (results, timings, diagnostics);
    }

    let window_days = temporal.window_days.max(1);
    let hard_filter = temporal.hard_filter;
    let mut kept = Vec::with_capacity(results.len());
    for mut hit in results {
        let Some(doc) = self.docs.get(&hit.doc_id) else {
            continue;
        };
        let doc_date = match doc_temporal_date(doc) {
            Some(date) => date,
            None => {
                // Undated documents cannot be judged; keep them unless filtering hard.
                if !hard_filter {
                    kept.push(hit);
                }
                continue;
            }
        };

        let after_start = temporal_start.is_none_or(|start| doc_date >= start);
        let before_end = temporal_end.is_none_or(|end| doc_date <= end);
        let within_explicit_range = after_start && before_end;
        if hard_filter && !within_explicit_range {
            continue;
        }
        if within_explicit_range {
            hit.score += TEMPORAL_RANGE_BOOST;
            hit.score_breakdown.recency_score += TEMPORAL_RANGE_BOOST;
        }

        if let Some(goal) = &target {
            let delta_days = doc_date
                .signed_duration_since(goal.target_date)
                .num_days()
                .abs();
            let inside_window = delta_days <= window_days;
            if hard_filter && !inside_window {
                continue;
            }
            if inside_window {
                // 1.0 on the target day, falling linearly to 0.0 at the window edge.
                let proximity = 1.0 - (delta_days as f32 / window_days as f32);
                let boost = proximity.clamp(0.0, 1.0) * TEMPORAL_PROXIMITY_WEIGHT;
                hit.score += boost;
                hit.score_breakdown.recency_score += boost;
            }
        }
        kept.push(hit);
    }

    // Descending by score; incomparable scores (NaN) are treated as equal.
    kept.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    kept.truncate(top_k);
    timings.total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
    (kept, timings, diagnostics)
}
/// Runs the lexical BM25 stage followed by the snapshot scoring stage and
/// reports how long each took.
///
/// A failing BM25 stage is not fatal: a warning is written to stderr and the
/// snapshot stage runs without lexical hits.
pub fn query_timed(
    &self,
    query: &str,
    top_k: usize,
) -> (Vec<SearchResult>, QueryTimings, QueryDiagnostics) {
    let overall_clock = Instant::now();

    // Stage 1: lexical BM25, over-fetching relative to `top_k`.
    let bm25_clock = Instant::now();
    let bm25_limit = top_k.saturating_mul(5).max(20);
    let lexical_hits = self
        .lexical_bm25(query, bm25_limit)
        .inspect_err(|err| eprintln!("warning: lexical BM25 query component failed: {}", err))
        .ok();
    let lexical_bm25_ms = bm25_clock.elapsed().as_secs_f64() * 1000.0;

    // Stage 2: in-memory scoring seeded with whatever BM25 produced.
    let snapshot_clock = Instant::now();
    let (hits, mut timings, diagnostics) =
        self.query_with_lexical_hits_timed(query, top_k, lexical_hits.as_ref());
    timings.snapshot_query_ms = snapshot_clock.elapsed().as_secs_f64() * 1000.0;
    timings.lexical_bm25_ms = lexical_bm25_ms;
    timings.total_ms = overall_clock.elapsed().as_secs_f64() * 1000.0;
    (hits, timings, diagnostics)
}
/// Scores `query` against the index using caller-supplied lexical hits
/// (document id -> BM25 score), returning only the ranked results.
pub fn query_with_lexical_hits(
    &self,
    query: &str,
    top_k: usize,
    lexical_hits: Option<&HashMap<String, f32>>,
) -> Vec<SearchResult> {
    let (hits, _timings, _diagnostics) =
        self.query_with_lexical_hits_timed(query, top_k, lexical_hits);
    hits
}
fn query_with_lexical_hits_timed(
&self,
query: &str,
top_k: usize,
lexical_hits: Option<&HashMap<String, f32>>,
) -> (Vec<SearchResult>, QueryTimings, QueryDiagnostics) {
const MAX_QUERY_CHARS: usize = 4096;
const MAX_QUERY_TOKENS: usize = 128;
let rerank_start = Instant::now();
let parse_start = Instant::now();
let truncated = if query.len() > MAX_QUERY_CHARS {
let mut cut = 0usize;
for (idx, _) in query.char_indices() {
if idx > MAX_QUERY_CHARS {
break;
}
cut = idx;
}
&query[..cut]
} else {
query
};
let q = normalize_for_index(truncated);
if q.is_empty() {
return (
Vec::new(),
QueryTimings::default(),
QueryDiagnostics::default(),
);
}
let mut q_terms: Vec<String> = tokenize_query_terms(&q);
if q_terms.len() > MAX_QUERY_TOKENS {
q_terms.truncate(MAX_QUERY_TOKENS);
}
if q_terms.is_empty() {
q_terms.push(q.clone());
}
let expanded = expand_query_terms(&q_terms);
let expanded_terms = expanded.expanded_terms;
if self.doc_u32_to_id.is_empty() {
return (
Vec::new(),
QueryTimings::default(),
QueryDiagnostics::default(),
);
}
let parse_ms = parse_start.elapsed().as_secs_f64() * 1000.0;
let sparse_start = Instant::now();
let mut candidates: HashMap<usize, CandidateState> = HashMap::new();
let query_set: HashSet<String> = q_terms.iter().cloned().collect();
fn score_doc<F>(
candidates: &mut HashMap<usize, CandidateState>,
doc_u32: usize,
delta: f32,
apply: F,
) where
F: FnOnce(&mut CandidateState),
{
let entry = candidates.entry(doc_u32).or_default();
entry.score += delta;
apply(entry);
}
if let Some(lexical_hits) = lexical_hits {
for (doc_id, bm25_score) in lexical_hits {
if let Some(&doc_u32) = self.doc_id_to_u32.get(doc_id) {
score_doc(&mut candidates, doc_u32 as usize, *bm25_score, |entry| {
entry.breakdown.lexical_score += *bm25_score;
});
}
}
}
if let Some(entity_u32) = self.entity_trie.get(&q) {
if let Some(postings) = self.entity_postings_chunk.get(entity_u32 as usize) {
for (chunk_u32, post_score) in postings {
let doc_u32 = self.chunks[*chunk_u32 as usize].doc_u32;
let delta = FULL_QUERY_ENTITY_WEIGHT * *post_score;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.entity_score += delta;
entry.matched_entities.push(q.clone());
});
}
}
}
for term in &q_terms {
let mut entity_ids = Vec::new();
if let Some(entity_u32) = self.entity_trie.get(term) {
entity_ids.push((entity_u32, 1.0f32));
} else if term.len() >= 4 {
for entity_u32 in self.entity_trie.prefix_ids(term, 1) {
entity_ids.push((entity_u32, ENTITY_PREFIX_MULTIPLIER));
}
}
for (entity_u32, mult) in entity_ids {
if let Some(postings) = self.entity_postings_chunk.get(entity_u32 as usize) {
for (chunk_u32, post_score) in postings {
let doc_u32 = self.chunks[*chunk_u32 as usize].doc_u32;
let delta = ENTITY_TERM_WEIGHT * *post_score * mult;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.entity_score += delta;
entry.matched_entities.push(term.clone());
});
}
}
}
let mut term_ids = Vec::new();
if let Some(term_u32) = self.term_trie.get(term) {
term_ids.push((term_u32, 1.0f32));
} else if term.len() >= 4 {
for term_u32 in self.term_trie.prefix_ids(term, 1) {
term_ids.push((term_u32, IMPORTANT_TERM_PREFIX_MULTIPLIER));
}
}
for (term_u32, mult) in term_ids {
if let Some(postings) = self.term_postings_chunk.get(term_u32 as usize) {
for (chunk_u32, post_score) in postings {
let doc_u32 = self.chunks[*chunk_u32 as usize].doc_u32;
let delta = IMPORTANT_TERM_WEIGHT * *post_score * mult;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.term_score += delta;
entry.matched_terms.push(term.clone());
});
}
}
}
if self.claim_scoring {
if let Some(postings) = self.claim_to_docs.get(term) {
for posting in postings {
if let Some(&doc_u32) = self.doc_id_to_u32.get(&posting.doc_id) {
let delta = 0.7 * posting.score;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.claim_score += delta;
entry.matched_terms.push(term.clone());
});
}
}
}
}
}
for term in &expanded_terms {
if let Some(entity_u32) = self.entity_trie.get(term) {
if let Some(postings) = self.entity_postings_chunk.get(entity_u32 as usize) {
for (chunk_u32, post_score) in postings {
let doc_u32 = self.chunks[*chunk_u32 as usize].doc_u32;
let delta = EXPANDED_ENTITY_WEIGHT * *post_score;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.entity_score += delta;
});
}
}
}
if let Some(term_u32) = self.term_trie.get(term) {
if let Some(postings) = self.term_postings_chunk.get(term_u32 as usize) {
for (chunk_u32, post_score) in postings {
let doc_u32 = self.chunks[*chunk_u32 as usize].doc_u32;
let delta = EXPANDED_TERM_WEIGHT * *post_score;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.term_score += delta;
});
}
}
}
if self.claim_scoring {
if let Some(postings) = self.claim_to_docs.get(term) {
for posting in postings {
if let Some(&doc_u32) = self.doc_id_to_u32.get(&posting.doc_id) {
let delta = 0.7 * 0.6 * posting.score;
score_doc(&mut candidates, doc_u32 as usize, delta, |entry| {
entry.breakdown.claim_score += delta;
});
}
}
}
}
}
let sparse_scoring_ms = sparse_start.elapsed().as_secs_f64() * 1000.0;
let candidate_accumulation_start = Instant::now();
let metadata_start = Instant::now();
let candidate_doc_ids = candidates.keys().copied().collect::<Vec<_>>();
for doc_u32 in candidate_doc_ids {
let Some(doc_id) = self.doc_u32_to_id.get(doc_u32) else {
continue;
};
let Some(doc) = self.docs.get(doc_id) else {
continue;
};
let entry = candidates.entry(doc_u32).or_default();
if let Some(topic) = doc.probable_topic.as_ref() {
let topic_tokens = tokenize_query_terms(topic);
let overlap = topic_tokens
.iter()
.filter(|t| query_set.contains(*t))
.count();
if overlap > 0 {
let delta = TOPIC_OVERLAP_WEIGHT * overlap as f32;
entry.score += delta;
entry.breakdown.topic_score += delta;
}
}
if let Some(dt) = doc.doc_type_guess.as_ref() {
let dt_tokens = tokenize_query_terms(dt);
let overlap = dt_tokens.iter().filter(|t| query_set.contains(*t)).count();
if overlap > 0 {
let delta = DOC_TYPE_OVERLAP_WEIGHT * overlap as f32;
entry.score += delta;
entry.breakdown.doc_type_score += delta;
}
}
if self.claim_scoring && !doc.top_claims.is_empty() {
let mut best_claim_delta = 0.0f32;
for claim in &doc.top_claims {
let claim_terms = claim_tokens(claim);
if claim_terms.is_empty() {
continue;
}
let matched = claim_terms
.iter()
.filter(|token| query_set.contains(*token))
.count();
if matched == 0 {
continue;
}
let coverage = matched as f32 / claim_terms.len().max(1) as f32;
let mut delta = 0.45 * coverage * claim.confidence.max(0.1);
if matched >= 2 {
delta += 0.15 * claim.confidence.max(0.1);
}
if delta > best_claim_delta {
best_claim_delta = delta;
}
}
if best_claim_delta > 0.0 {
entry.score += best_claim_delta;
entry.breakdown.claim_score += best_claim_delta;
}
}
if doc_has_timestamped_chunk(doc) || doc.timestamp.is_some() {
let delta = TIMESTAMP_PRESENCE_WEIGHT;
entry.score += delta;
entry.breakdown.recency_score += delta;
}
}
let candidate_accumulation_ms =
candidate_accumulation_start.elapsed().as_secs_f64() * 1000.0;
let metadata_ms = metadata_start.elapsed().as_secs_f64() * 1000.0;
let candidate_rank_start = Instant::now();
let mut ranked_docs: Vec<(usize, f32)> = candidates
.iter()
.map(|(doc_u32, state)| (*doc_u32, state.score))
.collect();
ranked_docs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let candidate_rank_ms = candidate_rank_start.elapsed().as_secs_f64() * 1000.0;
let graph_start = Instant::now();
let anchors = ranked_docs.iter().take(5).cloned().collect::<Vec<_>>();
for (anchor_idx, anchor_score) in &anchors {
let anchor_doc_id = &self.doc_u32_to_id[*anchor_idx];
let Some(anchor_doc) = self.docs.get(anchor_doc_id) else {
continue;
};
let anchor_weight = (*anchor_score / (1.0 + *anchor_score)).clamp(0.0, 1.0);
for linked in &anchor_doc.doc_links {
if let Some(&doc_u32) = self.doc_id_to_u32.get(linked) {
let Some(entry) = candidates.get_mut(&(doc_u32 as usize)) else {
continue;
};
let delta = (DOC_LINK_GRAPH_WEIGHT * anchor_weight)
.min(GRAPH_MAX_BOOST - entry.breakdown.graph_link_score);
if delta > 0.0 {
entry.score += delta;
entry.breakdown.graph_link_score += delta;
}
}
}
}
let graph_ms = graph_start.elapsed().as_secs_f64() * 1000.0;
let entity_graph_start = Instant::now();
let mut anchor_entities: HashSet<String> = HashSet::new();
for (doc_u32, _) in &anchors {
if let Some(keys) = self.doc_key_entities.get(*doc_u32) {
for key in keys {
if !key.is_empty() {
anchor_entities.insert(key.clone());
}
}
}
}
if !anchor_entities.is_empty() {
let candidate_doc_ids = ranked_docs
.iter()
.take(ENTITY_GRAPH_MAX_CANDIDATES)
.map(|(doc_u32, _)| *doc_u32)
.collect::<Vec<_>>();
for doc_u32 in candidate_doc_ids {
let Some(entry) = candidates.get_mut(&doc_u32) else {
continue;
};
let Some(keys) = self.doc_key_entities.get(doc_u32) else {
continue;
};
let overlap = keys.iter().filter(|k| anchor_entities.contains(*k)).count();
if overlap > 0 {
let delta = (ENTITY_GRAPH_WEIGHT * overlap as f32)
.min(GRAPH_MAX_BOOST - entry.breakdown.entity_graph_score);
if delta > 0.0 {
entry.score += delta;
entry.breakdown.entity_graph_score += delta;
}
}
}
}
let entity_graph_ms = entity_graph_start.elapsed().as_secs_f64() * 1000.0;
let sequence_rerank_start = Instant::now();
let rerank_window = ranked_docs
.iter()
.take(TEXT_RERANK_WINDOW.min(FINAL_RERANK_WINDOW))
.map(|(doc_u32, _)| *doc_u32)
.collect::<Vec<_>>();
if !q_terms.is_empty() {
for doc_u32 in rerank_window {
let Some(doc_id) = self.doc_u32_to_id.get(doc_u32) else {
continue;
};
let Some(doc) = self.docs.get(doc_id) else {
continue;
};
let candidate_text = candidate_rerank_text(doc);
let candidate_tokens = tokenize_query_terms(&normalize_for_index(&candidate_text));
if candidate_tokens.is_empty() {
continue;
}
let token_overlap = token_overlap_ratio(&q_terms, &candidate_tokens);
let mut delta = TEXT_RERANK_WEIGHT * token_overlap;
if self.text_rerank_ngram {
let ngram_overlap =
ngram_overlap_ratio(&q_terms, &candidate_tokens, TEXT_RERANK_NGRAM_SIZE);
delta += TEXT_RERANK_NGRAM_WEIGHT * ngram_overlap;
}
if self.text_rerank_lcs {
let lcs_overlap = lcs_ratio(&q_terms, &candidate_tokens);
delta += TEXT_RERANK_LCS_WEIGHT * lcs_overlap;
}
if delta <= 0.0 {
continue;
}
if let Some(entry) = candidates.get_mut(&doc_u32) {
entry.score += delta;
entry.breakdown.sequence_rerank_score += delta;
}
}
}
let sequence_rerank_ms = sequence_rerank_start.elapsed().as_secs_f64() * 1000.0;
let mut ranked_docs: Vec<(usize, f32)> = candidates
.iter()
.filter_map(|(doc_u32, state)| {
if state.score > 0.0 {
Some((*doc_u32, state.score))
} else {
None
}
})
.collect();
ranked_docs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let ranking_start = Instant::now();
let ranked_docs = ranked_docs
.into_iter()
.take(FINAL_RERANK_WINDOW.min(self.doc_u32_to_id.len()))
.collect::<Vec<_>>();
let mut grouped_ranked_docs: HashMap<String, Vec<(usize, f32)>> = HashMap::new();
let mut group_metadata: HashMap<String, (Option<String>, String)> = HashMap::new();
for (doc_u32, score) in ranked_docs {
let Some(doc_id) = self.doc_u32_to_id.get(doc_u32) else {
continue;
};
let Some(doc) = self.docs.get(doc_id) else {
continue;
};
let group_key = doc
.group_id
.clone()
.unwrap_or_else(|| format!("doc:{}", doc.doc_id));
grouped_ranked_docs
.entry(group_key.clone())
.or_default()
.push((doc_u32, score));
group_metadata
.entry(group_key)
.or_insert_with(|| (doc.group_id.clone(), doc.source.clone()));
}
let mut ranked_groups: Vec<(String, f32, Vec<(usize, f32)>)> = grouped_ranked_docs
.into_iter()
.map(|(group_key, mut items)| {
items.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let score = aggregate_group_score(&items);
(group_key, score, items)
})
.collect();
ranked_groups.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let mut per_group_counts: HashMap<String, usize> = HashMap::new();
let mut results = Vec::new();
for (group_key, _, items) in ranked_groups {
let (group_id, _) = group_metadata
.get(&group_key)
.cloned()
.unwrap_or((None, String::new()));
for (doc_u32, score) in items {
if results.len() >= top_k.min(self.doc_u32_to_id.len()) {
break;
}
let Some(doc_id) = self.doc_u32_to_id.get(doc_u32).cloned() else {
continue;
};
let Some(doc) = self.docs.get(&doc_id) else {
continue;
};
if let Some(group) = group_id.as_ref() {
let count = per_group_counts.entry(group.clone()).or_default();
if *count >= MAX_RESULTS_PER_GROUP {
continue;
}
*count += 1;
}
let state = candidates.get(&doc_u32);
results.push(SearchResult {
doc_id,
source: doc.source.clone(),
group_id: group_id.clone(),
score,
score_breakdown: state.map(|s| s.breakdown.clone()).unwrap_or_default(),
matched_entities: state
.map(|s| {
let mut v = s.matched_entities.clone();
v.sort();
v.dedup();
v
})
.unwrap_or_default(),
matched_terms: state
.map(|s| {
let mut v = s.matched_terms.clone();
v.sort();
v.dedup();
v
})
.unwrap_or_default(),
probable_topic: doc.probable_topic.clone(),
doc_type_guess: doc.doc_type_guess.clone(),
});
}
if results.len() >= top_k.min(self.doc_u32_to_id.len()) {
break;
}
}
let ranking_ms = ranking_start.elapsed().as_secs_f64() * 1000.0;
let rerank_ms = rerank_start.elapsed().as_secs_f64() * 1000.0;
(
results,
QueryTimings {
total_ms: rerank_ms,
refresh_ms: 0.0,
lexical_bm25_ms: 0.0,
snapshot_query_ms: rerank_ms,
rerank_ms,
parse_ms,
sparse_scoring_ms,
candidate_accumulation_ms,
candidate_rank_ms,
metadata_ms,
graph_ms,
entity_graph_ms,
sequence_rerank_ms,
ranking_ms,
},
QueryDiagnostics {
query_terms: q_terms.len(),
expanded_terms: expanded_terms.len(),
lexical_hits: lexical_hits.map_or(0, HashMap::len),
candidates: candidates.len(),
},
)
}
/// Builds an export view of the index.
///
/// Each document is reduced to the fields copied below (identifiers, source,
/// timestamp, length, group, topic, doc-type guess and provenance); the
/// topic and doc-type lookup tables are cloned as they are.
pub fn redacted_for_export(&self) -> RedactedMemoryIndex {
    let topic_to_docs = self.topic_to_docs.clone();
    let doc_type_to_docs = self.doc_type_to_docs.clone();
    let docs = self
        .docs
        .iter()
        .map(|(key, record)| {
            let redacted = RedactedDocRecord {
                doc_id: record.doc_id.clone(),
                source: record.source.clone(),
                timestamp: record.timestamp.clone(),
                doc_length: record.doc_length,
                group_id: record.group_id.clone(),
                probable_topic: record.probable_topic.clone(),
                doc_type_guess: record.doc_type_guess.clone(),
                provenance: record.provenance.clone(),
            };
            (key.clone(), redacted)
        })
        .collect();
    RedactedMemoryIndex {
        docs,
        topic_to_docs,
        doc_type_to_docs,
    }
}
/// Returns the sorted, de-duplicated ids of the chunks of `doc_id` that the
/// document's interval tree reports for the line range
/// `start_line..=end_line`.
///
/// The range is normalized before querying: the start is clamped to at least
/// line 1 and the end is clamped to at least the (clamped) start, so the tree
/// is never queried with an inverted interval. Returns an empty vector when
/// the document or its interval tree is unknown.
pub fn impacted_chunks_for_line_range(
    &self,
    doc_id: &str,
    start_line: usize,
    end_line: usize,
) -> Vec<String> {
    let Some(&doc_u32) = self.doc_id_to_u32.get(doc_id) else {
        return Vec::new();
    };
    let Some(tree) = self.doc_interval_trees.get(doc_u32 as usize) else {
        return Vec::new();
    };
    // Clamp the end against the *clamped* start; comparing against the raw
    // `start_line` would turn a request like (0, 0) into the inverted (1, 0).
    let start = start_line.max(1);
    let end = end_line.max(start);
    let mut chunk_ids = tree
        .query(start, end)
        .into_iter()
        .filter_map(|chunk_u32| {
            self.chunks
                .get(chunk_u32 as usize)
                .map(|m| m.chunk_id.clone())
        })
        .collect::<Vec<_>>();
    chunk_ids.sort_unstable();
    chunk_ids.dedup();
    chunk_ids
}
}
/// Derives the per-document semantic state (chunk table plus entity, term and
/// claim postings) from a single [`DocRecord`].
///
/// A record without section chunks is treated as one synthetic chunk spanning
/// the whole content, headed by the first heading or `"(document)"`. Claim
/// postings are produced only when `claim_scoring` is enabled. Every posting
/// list is sorted by descending score before the state is returned.
pub(crate) fn build_semantic_doc_state(
    record: &DocRecord,
    claim_scoring: bool,
) -> SemanticDocState {
    use std::cmp::Ordering;

    let owner = &record.doc_id;
    let mut state = SemanticDocState {
        doc_id: owner.clone(),
        chunk_ids: Vec::new(),
        chunks: HashMap::new(),
        entity_to_docs: HashMap::new(),
        term_to_docs: HashMap::new(),
        claim_to_docs: HashMap::new(),
        topic: record.probable_topic.as_ref().map(|t| t.to_lowercase()),
        doc_type: record.doc_type_guess.as_ref().map(|t| t.to_lowercase()),
    };

    let chunks = if !record.section_chunks.is_empty() {
        record.section_chunks.clone()
    } else {
        // Whole-document fallback chunk.
        let heading = record
            .headings
            .first()
            .cloned()
            .unwrap_or_else(|| "(document)".to_string());
        let last_line = record.content.lines().count().max(1);
        let entities = record
            .key_entities
            .iter()
            .map(|entity| normalize_for_index(&entity.text))
            .filter(|key| !key.is_empty())
            .collect();
        let terms = record
            .important_terms
            .iter()
            .map(|ranked| normalize_for_index(&ranked.term))
            .filter(|key| !key.is_empty())
            .collect();
        vec![SectionChunk {
            chunk_id: stable_chunk_id(
                &record.doc_id,
                heading.as_str(),
                &record.content,
                1,
                last_line,
            ),
            heading,
            content: record.content.clone(),
            start_line: 1,
            end_line: last_line,
            timestamp: record.timestamp.clone(),
            key_entities: entities,
            important_terms: terms,
        }]
    };

    for section in &chunks {
        state.chunk_ids.push(section.chunk_id.clone());
        let chunk_state = SemanticChunkState {
            chunk_id: section.chunk_id.clone(),
            doc_id: owner.clone(),
            heading: section.heading.clone(),
            start_line: section.start_line,
            end_line: section.end_line,
            key_entities: section.key_entities.clone(),
            important_terms: section.important_terms.clone(),
        };
        state.chunks.insert(section.chunk_id.clone(), chunk_state);

        // Important terms carry a higher posting score than heading tokens.
        for raw_term in &section.important_terms {
            let key = normalize_for_index(raw_term);
            if !key.is_empty() {
                state.term_to_docs.entry(key).or_default().push(TermPosting {
                    doc_id: owner.clone(),
                    score: 0.8,
                });
            }
        }
        for heading_token in tokenize_query_terms(&section.heading) {
            state
                .term_to_docs
                .entry(heading_token)
                .or_default()
                .push(TermPosting {
                    doc_id: owner.clone(),
                    score: 0.4,
                });
        }
        for raw_entity in &section.key_entities {
            let key = normalize_for_index(raw_entity);
            if !key.is_empty() {
                state
                    .entity_to_docs
                    .entry(key)
                    .or_default()
                    .push(EntityPosting {
                        doc_id: owner.clone(),
                        score: 0.9,
                    });
            }
        }
    }

    if claim_scoring {
        for claim in &record.top_claims {
            // Claim postings are weighted by confidence, floored at 0.1.
            let weight = claim.confidence.max(0.1);
            for token in claim_tokens(claim) {
                state
                    .claim_to_docs
                    .entry(token)
                    .or_default()
                    .push(TermPosting {
                        doc_id: owner.clone(),
                        score: weight,
                    });
            }
        }
    }

    // Highest score first; incomparable scores are treated as equal.
    for list in state.entity_to_docs.values_mut() {
        list.sort_by(|x, y| y.score.partial_cmp(&x.score).unwrap_or(Ordering::Equal));
    }
    for list in state.term_to_docs.values_mut() {
        list.sort_by(|x, y| y.score.partial_cmp(&x.score).unwrap_or(Ordering::Equal));
    }
    for list in state.claim_to_docs.values_mut() {
        list.sort_by(|x, y| y.score.partial_cmp(&x.score).unwrap_or(Ordering::Equal));
    }
    state
}
impl SemanticAggregate {
    /// Merges one document's semantic state into the aggregate.
    ///
    /// Registers the document's chunks (owner and line range), adds
    /// chunk-level term/entity postings, appends the document-level
    /// entity/term/claim postings (re-sorting each touched list by descending
    /// score) and records the document under its topic and doc type.
    ///
    /// This does not remove an earlier state of the same document.
    /// NOTE(review): callers presumably call `remove_doc` before re-inserting
    /// a document — confirm at the call sites.
    pub(crate) fn insert_doc_state(&mut self, state: &SemanticDocState) {
        self.doc_to_chunks
            .insert(state.doc_id.clone(), state.chunk_ids.clone());
        for chunk_id in &state.chunk_ids {
            self.chunk_to_doc
                .insert(chunk_id.clone(), state.doc_id.clone());
            let Some(chunk) = state.chunks.get(chunk_id) else {
                continue;
            };
            self.chunk_ranges
                .insert(chunk_id.clone(), (chunk.start_line, chunk.end_line));
            for term in &chunk.important_terms {
                let key = normalize_for_index(term);
                if key.is_empty() {
                    continue;
                }
                self.term_to_chunks
                    .entry(key)
                    .or_default()
                    .push((chunk_id.clone(), 0.8));
            }
            for token in tokenize_query_terms(&chunk.heading) {
                self.term_to_chunks
                    .entry(token)
                    .or_default()
                    .push((chunk_id.clone(), 0.4));
            }
            for entity in &chunk.key_entities {
                let key = normalize_for_index(entity);
                if key.is_empty() {
                    continue;
                }
                self.entity_to_chunks
                    .entry(key)
                    .or_default()
                    .push((chunk_id.clone(), 0.9));
            }
        }
        // Append and re-sort through the entry handle, avoiding a second
        // map lookup per key.
        for (key, postings) in &state.entity_to_docs {
            let merged = self.entity_to_docs.entry(key.clone()).or_default();
            merged.extend(postings.iter().cloned());
            merged.sort_by(|a, b| {
                b.score
                    .partial_cmp(&a.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        }
        for (key, postings) in &state.term_to_docs {
            let merged = self.term_to_docs.entry(key.clone()).or_default();
            merged.extend(postings.iter().cloned());
            merged.sort_by(|a, b| {
                b.score
                    .partial_cmp(&a.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        }
        for (key, postings) in &state.claim_to_docs {
            let merged = self.claim_to_docs.entry(key.clone()).or_default();
            merged.extend(postings.iter().cloned());
            merged.sort_by(|a, b| {
                b.score
                    .partial_cmp(&a.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        }
        if let Some(topic) = state.topic.as_ref() {
            self.topic_to_docs
                .entry(topic.clone())
                .or_default()
                .push(state.doc_id.clone());
        }
        if let Some(doc_type) = state.doc_type.as_ref() {
            self.doc_type_to_docs
                .entry(doc_type.clone())
                .or_default()
                .push(state.doc_id.clone());
        }
    }

    /// Removes every trace of `doc_id` from the aggregate: its chunk
    /// registrations, chunk-level and document-level postings, and its topic
    /// and doc-type memberships. Posting lists and membership lists that end
    /// up empty are dropped. Unknown documents are a no-op apart from that
    /// empty-list cleanup.
    pub(crate) fn remove_doc(&mut self, doc_id: &str) {
        // Collect the removed chunk ids into a set so each posting list is
        // filtered once, instead of being rescanned once per removed chunk.
        let removed_chunks: HashSet<String> = self
            .doc_to_chunks
            .remove(doc_id)
            .unwrap_or_default()
            .into_iter()
            .collect();
        for chunk_id in &removed_chunks {
            self.chunk_to_doc.remove(chunk_id);
            self.chunk_ranges.remove(chunk_id);
        }
        self.term_to_chunks.retain(|_, postings| {
            postings.retain(|(id, _)| !removed_chunks.contains(id));
            !postings.is_empty()
        });
        self.entity_to_chunks.retain(|_, postings| {
            postings.retain(|(id, _)| !removed_chunks.contains(id));
            !postings.is_empty()
        });
        self.entity_to_docs.retain(|_, postings| {
            postings.retain(|posting| posting.doc_id != doc_id);
            !postings.is_empty()
        });
        self.term_to_docs.retain(|_, postings| {
            postings.retain(|posting| posting.doc_id != doc_id);
            !postings.is_empty()
        });
        self.claim_to_docs.retain(|_, postings| {
            postings.retain(|posting| posting.doc_id != doc_id);
            !postings.is_empty()
        });
        self.topic_to_docs.retain(|_, docs| {
            docs.retain(|id| id != doc_id);
            !docs.is_empty()
        });
        self.doc_type_to_docs.retain(|_, docs| {
            docs.retain(|id| id != doc_id);
            !docs.is_empty()
        });
    }
}
/// Extracts lowercase tokens from `input`.
///
/// A token starts with an ASCII letter and continues with at least two more
/// ASCII letters, digits, `_` or `-`, so anything shorter than three
/// characters is ignored.
fn tokenize_query_terms(input: &str) -> Vec<String> {
    static WORD_PATTERN: OnceLock<Regex> = OnceLock::new();
    let pattern = WORD_PATTERN
        .get_or_init(|| Regex::new(r"[A-Za-z][A-Za-z0-9_-]{2,}").expect("valid regex"));
    let mut tokens = Vec::new();
    for found in pattern.find_iter(input) {
        tokens.push(found.as_str().to_lowercase());
    }
    tokens
}
/// Returns the sorted, de-duplicated tokens found in a claim's subject,
/// predicate and object after index normalization.
fn claim_tokens(claim: &Claim) -> Vec<String> {
    let mut collected: Vec<String> = [&claim.subject, &claim.predicate, &claim.object]
        .into_iter()
        .flat_map(|part| tokenize_query_terms(&normalize_for_index(part)))
        .collect();
    collected.sort();
    collected.dedup();
    collected
}
/// Produces a plain fallback query: the input is transliterated with
/// `deunicode`, lowercased, and reduced to its alphanumeric tokens (which may
/// contain `_` or `-` after the first character) joined by single spaces.
///
/// Returns an empty string when no token is found.
fn sanitize_bm25_query(query: &str) -> String {
    static FALLBACK_RE: OnceLock<Regex> = OnceLock::new();
    let pattern = FALLBACK_RE.get_or_init(|| {
        Regex::new(r"[A-Za-z0-9][A-Za-z0-9_-]*").expect("valid fallback query regex")
    });
    let ascii_lower = deunicode(query).to_lowercase();
    let mut sanitized = String::new();
    for token in pattern.find_iter(&ascii_lower) {
        // Matches are never empty, so an empty buffer means "first token".
        if !sanitized.is_empty() {
            sanitized.push(' ');
        }
        sanitized.push_str(token.as_str());
    }
    sanitized
}
/// Assembles the text a document is compared against during text reranking.
///
/// The space-joined parts are, in order: headings, probable topic, doc-type
/// guess, important terms, key entities, temporal terms and the first
/// `TEXT_RERANK_CONTENT_CHARS` characters of the content. Absent or empty
/// sections are skipped.
fn candidate_rerank_text(doc: &DocRecord) -> String {
    let mut segments: Vec<String> = Vec::new();
    if !doc.headings.is_empty() {
        segments.push(doc.headings.join(" "));
    }
    segments.extend(doc.probable_topic.iter().cloned());
    segments.extend(doc.doc_type_guess.iter().cloned());
    if !doc.important_terms.is_empty() {
        let terms: Vec<&str> = doc
            .important_terms
            .iter()
            .map(|ranked| ranked.term.as_str())
            .collect();
        segments.push(terms.join(" "));
    }
    if !doc.key_entities.is_empty() {
        let names: Vec<&str> = doc
            .key_entities
            .iter()
            .map(|entity| entity.text.as_str())
            .collect();
        segments.push(names.join(" "));
    }
    if !doc.temporal_terms.is_empty() {
        segments.push(doc.temporal_terms.join(" "));
    }
    // Character-based (not byte-based) cap on the content prefix.
    let snippet = doc
        .content
        .chars()
        .take(TEXT_RERANK_CONTENT_CHARS)
        .collect::<String>();
    if !snippet.is_empty() {
        segments.push(snippet);
    }
    segments.join(" ")
}
/// Reports whether at least one section chunk of `doc` carries a timestamp.
fn doc_has_timestamped_chunk(doc: &DocRecord) -> bool {
    for chunk in &doc.section_chunks {
        if chunk.timestamp.is_some() {
            return true;
        }
    }
    false
}
/// Maximum score boost for a document dated exactly on the resolved temporal
/// target; the boost is scaled down by the document's distance from the
/// target within the query window.
const TEMPORAL_PROXIMITY_WEIGHT: f32 = 0.45;
/// Flat score boost added to a document whose date lies within the temporal
/// range bounds.
const TEMPORAL_RANGE_BOOST: f32 = 0.15;
/// Picks the date used to place `doc` in time: the first section-chunk
/// timestamp that parses, otherwise the document-level timestamp.
fn doc_temporal_date(doc: &DocRecord) -> Option<NaiveDate> {
    for chunk in &doc.section_chunks {
        if let Some(date) = parse_temporal_date(chunk.timestamp.as_deref()) {
            return Some(date);
        }
    }
    parse_temporal_date(doc.timestamp.as_deref())
}
/// Parses the optional range bounds and returns them as `(start, end)`.
///
/// When both bounds parse and the start is later than the end, the two are
/// swapped so the returned range is always ordered. A bound that is absent or
/// does not parse stays `None`.
fn normalize_temporal_bounds(
    starts_from: Option<&str>,
    ends_at: Option<&str>,
) -> (Option<NaiveDate>, Option<NaiveDate>) {
    let lower = parse_temporal_date(starts_from);
    let upper = parse_temporal_date(ends_at);
    if let (Some(from), Some(to)) = (lower, upper) {
        if from > to {
            return (Some(to), Some(from));
        }
    }
    (lower, upper)
}
/// Scores a group from its member scores.
///
/// The first item's score is the base; the next two items add 16% of their
/// summed scores, and each further member (up to three) adds 0.03. An empty
/// group scores `0.0`. The first item is taken as-is, so callers wanting the
/// best member as the base must pass `items` sorted by descending score.
fn aggregate_group_score(items: &[(usize, f32)]) -> f32 {
    let Some((&(_, base), followers)) = items.split_first() else {
        return 0.0;
    };
    let support: f32 = followers.iter().take(2).map(|&(_, score)| score).sum();
    let coverage = followers.len().min(3) as f32;
    base + 0.16 * support + 0.03 * coverage
}
/// Normalizes entity texts into index keys, dropping empty keys and keeping
/// only the first occurrence of each key, in input order.
fn normalized_entity_keys(entities: &[Tier1Entity]) -> Vec<String> {
    let mut already_emitted = HashSet::new();
    entities
        .iter()
        .map(|entity| normalize_for_index(&entity.text))
        .filter(|key| !key.is_empty() && already_emitted.insert(key.clone()))
        .collect()
}
/// Fraction of query tokens that also occur among the candidate tokens.
///
/// Repeated query tokens are counted each time they appear. Returns `0.0`
/// when either side is empty.
fn token_overlap_ratio(query_tokens: &[String], candidate_tokens: &[String]) -> f32 {
    if query_tokens.is_empty() || candidate_tokens.is_empty() {
        return 0.0;
    }
    let vocabulary: HashSet<&str> = candidate_tokens.iter().map(|t| t.as_str()).collect();
    let mut hits = 0usize;
    for token in query_tokens {
        if vocabulary.contains(token.as_str()) {
            hits += 1;
        }
    }
    hits as f32 / query_tokens.len().max(1) as f32
}
/// Fraction of the query's token n-grams that also occur among the
/// candidate's token n-grams.
///
/// Returns `0.0` when `n` is zero or either token list is shorter than `n`.
fn ngram_overlap_ratio(query_tokens: &[String], candidate_tokens: &[String], n: usize) -> f32 {
    let too_short = query_tokens.len() < n || candidate_tokens.len() < n;
    if n == 0 || too_short {
        return 0.0;
    }
    let query_grams = build_ngrams(query_tokens, n);
    if query_grams.is_empty() {
        return 0.0;
    }
    let candidate_grams: HashSet<Vec<String>> =
        build_ngrams(candidate_tokens, n).into_iter().collect();
    if candidate_grams.is_empty() {
        return 0.0;
    }
    let mut shared = 0usize;
    for gram in &query_grams {
        if candidate_grams.contains(gram) {
            shared += 1;
        }
    }
    // Denominator equals the number of n-grams in the query.
    let denominator = query_tokens.len().saturating_sub(n - 1).max(1);
    shared as f32 / denominator as f32
}
/// Returns every contiguous run of `n` tokens, in order, as owned vectors.
///
/// Yields nothing when `n` is zero or there are fewer than `n` tokens.
fn build_ngrams(tokens: &[String], n: usize) -> Vec<Vec<String>> {
    // `windows` panics on a zero size, so that case is rejected up front.
    if n == 0 || tokens.len() < n {
        return Vec::new();
    }
    let mut grams = Vec::with_capacity(tokens.len() - n + 1);
    for window in tokens.windows(n) {
        grams.push(window.to_vec());
    }
    grams
}
/// Length of the longest common subsequence of the two token lists, divided
/// by the number of query tokens. Returns `0.0` when either list is empty.
///
/// Uses a two-row dynamic-programming table, so memory grows with the
/// candidate length only.
fn lcs_ratio(query_tokens: &[String], candidate_tokens: &[String]) -> f32 {
    if query_tokens.is_empty() || candidate_tokens.is_empty() {
        return 0.0;
    }
    let width = candidate_tokens.len() + 1;
    let mut above = vec![0usize; width];
    let mut row = vec![0usize; width];
    for needle in query_tokens {
        for (col, hay) in candidate_tokens.iter().enumerate() {
            row[col + 1] = if needle == hay {
                above[col] + 1
            } else {
                row[col].max(above[col + 1])
            };
        }
        // The finished row becomes the "previous" row of the next pass; the
        // stale buffer is fully overwritten before it is read again.
        std::mem::swap(&mut above, &mut row);
    }
    let lcs_len = above[width - 1];
    lcs_len as f32 / query_tokens.len().max(1) as f32
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes and tokenizes a sentence the same way the query path does.
    fn tokens_of(text: &str) -> Vec<String> {
        tokenize_query_terms(&normalize_for_index(text))
    }

    /// Shorthand for a calendar date that is known to be valid.
    fn ymd(year: i32, month: u32, day: u32) -> chrono::NaiveDate {
        chrono::NaiveDate::from_ymd_opt(year, month, day).expect("valid date")
    }

    // Every text-similarity signal should rank the on-topic sentence above
    // the unrelated one.
    #[test]
    fn text_overlap_prefers_matching_candidate() {
        let question = tokens_of("What degree did I graduate with?");
        let relevant = tokens_of("I graduated with a computer science degree.");
        let unrelated = tokens_of("I cooked pasta and watched a movie.");
        assert!(
            token_overlap_ratio(&question, &relevant) > token_overlap_ratio(&question, &unrelated)
        );
        assert!(
            ngram_overlap_ratio(&question, &relevant, 2)
                > ngram_overlap_ratio(&question, &unrelated, 2)
        );
        assert!(lcs_ratio(&question, &relevant) > lcs_ratio(&question, &unrelated));
    }

    // A document with no document-level timestamp still counts as
    // timestamped when one of its chunks has a timestamp.
    #[test]
    fn timestamped_chunk_detects_chunk_level_temporal_anchor() {
        let dated_chunk = SectionChunk {
            chunk_id: "doc-1::0".to_string(),
            heading: "Overview".to_string(),
            content: "content".to_string(),
            start_line: 1,
            end_line: 1,
            timestamp: Some("2024-05-10".to_string()),
            key_entities: vec![],
            important_terms: vec![],
        };
        let provenance = Provenance {
            source: "source://doc-1".to_string(),
            timestamp: None,
            ner_provider: "heuristic".to_string(),
            term_ranker: "yake".to_string(),
            index_version: "v1".to_string(),
        };
        let record = DocRecord {
            doc_id: "doc-1".to_string(),
            source: "source://doc-1".to_string(),
            content: "content".to_string(),
            timestamp: None,
            doc_length: 7,
            author_agent: None,
            group_id: None,
            probable_topic: None,
            doc_type_guess: None,
            headings: vec!["Overview".to_string()],
            doc_links: vec![],
            temporal_terms: vec![],
            key_entities: vec![],
            important_terms: vec![],
            section_chunks: vec![dated_chunk],
            embedding: None,
            top_claims: vec![],
            provenance,
        };
        assert!(doc_has_timestamped_chunk(&record));
    }

    // Reversed bounds are swapped into order; identical bounds are kept.
    #[test]
    fn normalize_temporal_bounds_orders_reversed_ranges_and_preserves_same_day() {
        let (lower, upper) = normalize_temporal_bounds(Some("2024-05-10"), Some("2024-05-01"));
        assert_eq!(lower, Some(ymd(2024, 5, 1)));
        assert_eq!(upper, Some(ymd(2024, 5, 10)));

        let (same_lower, same_upper) =
            normalize_temporal_bounds(Some("2024-05-10"), Some("2024-05-10"));
        assert_eq!(same_lower, same_upper);
        assert_eq!(same_lower, Some(ymd(2024, 5, 10)));
    }
}