use std::collections::{BTreeMap, BTreeSet, HashMap};
use crate::Properties;
use super::StoredIndexEntity;
pub(super) type TermCounts = BTreeMap<String, u32>;
pub(super) type PropertyTermCounts = BTreeMap<String, TermCounts>;
pub fn standard_analyzer(text: &str) -> Vec<String> {
let mut out = Vec::new();
let mut buf = String::new();
for ch in text.chars() {
if ch.is_alphanumeric() {
for low in ch.to_lowercase() {
buf.push(low);
}
} else if !buf.is_empty() {
out.push(std::mem::take(&mut buf));
}
}
if !buf.is_empty() {
out.push(buf);
}
out
}
#[derive(Debug, Default, Clone)]
pub(super) struct FulltextRegistry {
by_name: HashMap<String, FulltextIndex>,
}
impl FulltextRegistry {
pub(super) fn register(&mut self, name: String, labels: Vec<String>, properties: Vec<String>) {
let entry = FulltextIndex::new(labels, properties);
self.by_name.insert(name, entry);
}
pub(super) fn deregister(&mut self, name: &str) {
self.by_name.remove(name);
}
pub(super) fn get(&self, name: &str) -> Option<&FulltextIndex> {
self.by_name.get(name)
}
pub(super) fn get_mut(&mut self, name: &str) -> Option<&mut FulltextIndex> {
self.by_name.get_mut(name)
}
pub(super) fn iter(&self) -> impl Iterator<Item = (&String, &FulltextIndex)> {
self.by_name.iter()
}
pub(super) fn by_name_mut(&mut self) -> impl Iterator<Item = (&String, &mut FulltextIndex)> {
self.by_name.iter_mut()
}
pub(super) fn indexes_for_labels<'a, I>(
&'a self,
labels: I,
) -> impl Iterator<Item = (&'a String, &'a FulltextIndex)>
where
I: IntoIterator<Item = &'a str>,
I::IntoIter: Clone,
{
let labels = labels.into_iter();
self.by_name
.iter()
.filter(move |(_, idx)| idx.covers_any_label(labels.clone()))
}
pub(super) fn indexes_for_labels_mut<'a, I>(
&'a mut self,
labels: I,
) -> impl Iterator<Item = (&'a String, &'a mut FulltextIndex)>
where
I: IntoIterator<Item = &'a str>,
I::IntoIter: Clone,
{
let labels = labels.into_iter();
self.by_name
.iter_mut()
.filter(move |(_, idx)| idx.covers_any_label(labels.clone()))
}
pub(super) fn remove_entity_everywhere(&mut self, entity_id: u64) {
for index in self.by_name.values_mut() {
index.remove_entity(entity_id);
}
}
}
#[derive(Debug, Clone)]
pub(super) struct FulltextIndex {
pub labels: Vec<String>,
pub properties: Vec<String>,
postings: BTreeMap<String, BTreeMap<u64, u32>>,
entity_terms: BTreeMap<u64, BTreeSet<String>>,
}
impl FulltextIndex {
fn new(labels: Vec<String>, properties: Vec<String>) -> Self {
Self {
labels,
properties,
postings: BTreeMap::new(),
entity_terms: BTreeMap::new(),
}
}
pub(super) fn property_is_covered(&self, property: &str) -> bool {
self.properties.iter().any(|p| p == property)
}
pub(super) fn covers_any_label<'a>(&self, labels: impl IntoIterator<Item = &'a str>) -> bool {
labels
.into_iter()
.any(|label| self.labels.iter().any(|wanted| wanted == label))
}
pub(super) fn reindex_entity(&mut self, entity_id: u64, terms: TermCounts) {
if let Some(old_terms) = self.entity_terms.remove(&entity_id) {
for term in old_terms {
if let Some(bucket) = self.postings.get_mut(&term) {
bucket.remove(&entity_id);
if bucket.is_empty() {
self.postings.remove(&term);
}
}
}
}
if terms.is_empty() {
return;
}
let mut new_terms = BTreeSet::new();
for (term, tf) in terms {
self.postings
.entry(term.clone())
.or_default()
.insert(entity_id, tf);
new_terms.insert(term);
}
self.entity_terms.insert(entity_id, new_terms);
}
pub(super) fn remove_entity(&mut self, entity_id: u64) {
if let Some(terms) = self.entity_terms.remove(&entity_id) {
for term in terms {
if let Some(bucket) = self.postings.get_mut(&term) {
bucket.remove(&entity_id);
if bucket.is_empty() {
self.postings.remove(&term);
}
}
}
}
}
pub(super) fn query(&self, query_text: &str) -> Vec<(u64, f64)> {
let tokens = standard_analyzer(query_text);
if tokens.is_empty() {
return Vec::new();
}
let mut posting_iter: Vec<&BTreeMap<u64, u32>> = Vec::with_capacity(tokens.len());
for token in &tokens {
match self.postings.get(token) {
Some(p) => posting_iter.push(p),
None => return Vec::new(), }
}
posting_iter.sort_by_key(|p| p.len());
let mut results: BTreeMap<u64, u32> = BTreeMap::new();
let Some(seed) = posting_iter.first() else {
return Vec::new();
};
for (id, tf) in *seed {
results.insert(*id, *tf);
}
for posting in posting_iter.iter().skip(1) {
let mut next: BTreeMap<u64, u32> = BTreeMap::new();
for (id, acc) in &results {
if let Some(tf) = posting.get(id) {
next.insert(*id, acc.saturating_add(*tf));
}
}
results = next;
if results.is_empty() {
return Vec::new();
}
}
let mut out: Vec<(u64, f64)> = results
.into_iter()
.map(|(id, tf)| (id, tf as f64))
.collect();
out.sort_by(|a, b| {
b.1.partial_cmp(&a.1)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.0.cmp(&b.0))
});
out
}
}
pub(super) fn tokenize_to_term_counts(value: &str) -> TermCounts {
let mut out = TermCounts::new();
for tok in standard_analyzer(value) {
*out.entry(tok).or_insert(0) += 1;
}
out
}
pub(super) fn string_property_term_counts(properties: &Properties) -> PropertyTermCounts {
let mut out = PropertyTermCounts::new();
for (key, value) in properties {
if let crate::PropertyValue::String(value) = value {
out.insert(key.clone(), tokenize_to_term_counts(value));
}
}
out
}
pub(super) fn term_counts_for_properties(
properties: &Properties,
selected_properties: &[String],
) -> TermCounts {
let by_property = string_property_term_counts(properties);
term_counts_for_selected_properties(&by_property, selected_properties)
}
pub(super) fn term_counts_for_selected_properties(
by_property: &PropertyTermCounts,
selected_properties: &[String],
) -> TermCounts {
let mut out = TermCounts::new();
for property in selected_properties {
if let Some(counts) = by_property.get(property) {
merge_term_counts(&mut out, counts.clone());
}
}
out
}
pub(super) fn merge_term_counts(into: &mut TermCounts, more: TermCounts) {
for (k, v) in more {
*into.entry(k).or_insert(0) += v;
}
}
#[allow(dead_code)]
pub(super) fn registry_for_entity_kind(entity: StoredIndexEntity) -> &'static str {
match entity {
StoredIndexEntity::Node => "node",
StoredIndexEntity::Relationship => "relationship",
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn analyzer_lowercases_and_splits_on_punct() {
assert_eq!(
standard_analyzer("Hello, World! 42"),
vec!["hello", "world", "42"]
);
}
#[test]
fn analyzer_collapses_runs() {
assert_eq!(standard_analyzer("a b\tc\n d"), vec!["a", "b", "c", "d"]);
}
#[test]
fn reindex_replaces_old_contribution() {
let mut idx = FulltextIndex::new(vec!["L".into()], vec!["p".into()]);
idx.reindex_entity(1, tokenize_to_term_counts("foo bar"));
idx.reindex_entity(1, tokenize_to_term_counts("baz"));
let r = idx.query("foo");
assert!(r.is_empty(), "expected empty after reindex, got {r:?}");
let r = idx.query("baz");
assert_eq!(r, vec![(1, 1.0)]);
}
#[test]
fn query_intersects_terms() {
let mut idx = FulltextIndex::new(vec!["L".into()], vec!["p".into()]);
idx.reindex_entity(1, tokenize_to_term_counts("alpha beta gamma"));
idx.reindex_entity(2, tokenize_to_term_counts("alpha gamma delta"));
let r = idx.query("alpha beta");
assert_eq!(r, vec![(1, 2.0)], "only entity 1 has both terms");
}
#[test]
fn query_returns_empty_for_unknown_term() {
let mut idx = FulltextIndex::new(vec!["L".into()], vec!["p".into()]);
idx.reindex_entity(1, tokenize_to_term_counts("alpha"));
assert!(idx.query("zeta").is_empty());
}
}