use std::collections::{HashMap, HashSet};
#[derive(Debug, Clone)]
pub struct SearchHit {
pub subject: String,
pub predicate: String,
pub literal: String,
pub score: f64,
}
#[derive(Debug, Clone)]
struct IndexedDoc {
id: u32,
subject: String,
predicate: String,
literal: String,
tokens: Vec<String>,
}
pub struct TextSearchIndex {
inverted: HashMap<String, HashSet<u32>>,
docs: HashMap<u32, IndexedDoc>,
sp_index: HashMap<(String, String), u32>,
next_id: u32,
stop_words: HashSet<&'static str>,
}
impl TextSearchIndex {
const STOP_WORDS: &'static [&'static str] = &[
"a", "an", "and", "are", "as", "at", "be", "been", "by", "for", "from", "has", "have",
"he", "in", "is", "it", "its", "of", "on", "or", "she", "that", "the", "their", "there",
"they", "this", "to", "was", "were", "will", "with",
];
pub fn new() -> Self {
TextSearchIndex {
inverted: HashMap::new(),
docs: HashMap::new(),
sp_index: HashMap::new(),
next_id: 0,
stop_words: Self::STOP_WORDS.iter().cloned().collect(),
}
}
pub fn index_triple(&mut self, subject: &str, predicate: &str, literal: &str) {
let sp_key = (subject.to_string(), predicate.to_string());
if let Some(old_id) = self.sp_index.remove(&sp_key) {
self.remove_doc_by_id(old_id);
}
let id = self.next_id;
self.next_id = self.next_id.saturating_add(1);
let tokens = self.tokenize_and_filter(literal);
let term_set: HashSet<String> = tokens.iter().cloned().collect();
for term in &term_set {
self.inverted.entry(term.clone()).or_default().insert(id);
}
let doc = IndexedDoc {
id,
subject: subject.to_string(),
predicate: predicate.to_string(),
literal: literal.to_string(),
tokens,
};
self.docs.insert(id, doc);
self.sp_index.insert(sp_key, id);
}
pub fn remove_triple(&mut self, subject: &str, predicate: &str) -> bool {
let sp_key = (subject.to_string(), predicate.to_string());
if let Some(id) = self.sp_index.remove(&sp_key) {
self.remove_doc_by_id(id);
true
} else {
false
}
}
pub fn search(&self, query: &str) -> Vec<SearchHit> {
let query_terms = Self::tokenize_raw(query);
if query_terms.is_empty() {
return Vec::new();
}
let n = self.docs.len() as f64;
if n == 0.0 {
return Vec::new();
}
let mut scores: HashMap<u32, f64> = HashMap::new();
for term in &query_terms {
let posting = match self.inverted.get(term) {
Some(set) => set,
None => continue,
};
let df = posting.len() as f64;
let idf = ((n + 1.0) / (df + 1.0)).ln() + 1.0;
for &doc_id in posting {
if let Some(doc) = self.docs.get(&doc_id) {
let tf = doc.tokens.iter().filter(|t| *t == term).count() as f64;
*scores.entry(doc_id).or_insert(0.0) += tf * idf;
}
}
}
let mut hits: Vec<(u32, f64)> = scores.into_iter().collect();
hits.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
hits.into_iter()
.filter_map(|(doc_id, score)| {
let doc = self.docs.get(&doc_id)?;
Some(SearchHit {
subject: doc.subject.clone(),
predicate: doc.predicate.clone(),
literal: doc.literal.clone(),
score,
})
})
.collect()
}
pub fn predicates_for_subject(&self, subject: &str) -> Vec<String> {
self.sp_index
.keys()
.filter(|(s, _)| s == subject)
.map(|(_, p)| p.clone())
.collect()
}
pub fn size(&self) -> usize {
self.inverted.len()
}
pub fn document_count(&self) -> usize {
self.docs.len()
}
fn tokenize_raw(text: &str) -> Vec<String> {
text.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|t| t.len() > 1)
.map(|t| t.to_string())
.collect()
}
fn tokenize_and_filter(&self, text: &str) -> Vec<String> {
Self::tokenize_raw(text)
.into_iter()
.filter(|t| !self.stop_words.contains(t.as_str()))
.collect()
}
fn remove_doc_by_id(&mut self, id: u32) {
if let Some(doc) = self.docs.remove(&id) {
let term_set: HashSet<String> = doc.tokens.into_iter().collect();
for term in &term_set {
if let Some(posting) = self.inverted.get_mut(term) {
posting.remove(&id);
}
}
self.inverted.retain(|_, posting| !posting.is_empty());
}
}
}
impl Default for TextSearchIndex {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn subj(n: u32) -> String {
format!("http://ex.org/s{}", n)
}
fn pred() -> &'static str {
"http://ex.org/label"
}
#[test]
fn test_index_and_search_basic() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "Rust programming language systems");
idx.index_triple(&subj(2), pred(), "Python scripting language");
let hits = idx.search("rust programming");
assert!(!hits.is_empty(), "Should find 'rust programming'");
assert_eq!(hits[0].subject, subj(1));
}
#[test]
fn test_empty_search_returns_empty() {
let idx = TextSearchIndex::new();
let hits = idx.search("rust");
assert!(hits.is_empty());
}
#[test]
fn test_search_no_match_returns_empty() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "hello world");
let hits = idx.search("zzzyyyxxx");
assert!(hits.is_empty());
}
#[test]
fn test_tfidf_higher_tf_ranks_higher() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "database management systems");
idx.index_triple(&subj(2), pred(), "database database database storage");
let hits = idx.search("database");
assert_eq!(hits.len(), 2, "Both docs should match");
assert_eq!(hits[0].subject, subj(2), "Higher TF should rank first");
}
#[test]
fn test_tfidf_score_positive() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "semantic web ontology");
let hits = idx.search("semantic");
assert_eq!(hits.len(), 1);
assert!(hits[0].score > 0.0, "Score should be positive");
}
#[test]
fn test_tfidf_multiterm_sum() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "knowledge graph reasoning inference");
idx.index_triple(
&subj(2),
pred(),
"knowledge knowledge graph graph data modeling",
);
let hits = idx.search("knowledge graph");
assert_eq!(hits.len(), 2);
assert_eq!(hits[0].subject, subj(2));
}
#[test]
fn test_remove_triple_removes_document() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "hello world");
idx.index_triple(&subj(2), pred(), "hello rust");
idx.remove_triple(&subj(1), pred());
let hits = idx.search("hello");
assert_eq!(hits.len(), 1, "Only s2 should remain");
assert_eq!(hits[0].subject, subj(2));
}
#[test]
fn test_remove_nonexistent_returns_false() {
let mut idx = TextSearchIndex::new();
let removed = idx.remove_triple(&subj(99), pred());
assert!(!removed, "Removing nonexistent should return false");
}
#[test]
fn test_remove_cleans_inverted_index() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "uniquetermzyx");
assert_eq!(idx.size(), 1);
idx.remove_triple(&subj(1), pred());
assert_eq!(
idx.size(),
0,
"Inverted index should be empty after removal"
);
}
#[test]
fn test_reindex_same_sp_replaces() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "old value content");
idx.index_triple(&subj(1), pred(), "new value content");
let hits = idx.search("old");
assert!(
hits.iter().all(|h| h.subject != subj(1)),
"'old' should be gone after re-index"
);
let hits = idx.search("new");
assert!(hits.iter().any(|h| h.subject == subj(1)));
}
#[test]
fn test_reindex_doc_count_unchanged() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "version one");
assert_eq!(idx.document_count(), 1);
idx.index_triple(&subj(1), pred(), "version two");
assert_eq!(idx.document_count(), 1, "Re-index should not add a new doc");
}
#[test]
fn test_stop_words_not_indexed() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "the and or is");
assert_eq!(idx.size(), 0, "Stop words alone should produce empty index");
}
#[test]
fn test_stop_words_mixed_with_content() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "the quick brown fox");
let hits = idx.search("quick");
assert!(!hits.is_empty(), "Non-stop-word should be found");
}
#[test]
fn test_size_reflects_term_count() {
let mut idx = TextSearchIndex::new();
assert_eq!(idx.size(), 0);
idx.index_triple(&subj(1), pred(), "alpha beta gamma");
assert!(idx.size() >= 3, "Should have at least 3 terms");
}
#[test]
fn test_document_count() {
let mut idx = TextSearchIndex::new();
assert_eq!(idx.document_count(), 0);
idx.index_triple(&subj(1), pred(), "doc one");
assert_eq!(idx.document_count(), 1);
idx.index_triple(&subj(2), pred(), "doc two");
assert_eq!(idx.document_count(), 2);
}
#[test]
fn test_multiple_predicates_same_subject() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), "http://ex.org/title", "Semantic Web");
idx.index_triple(&subj(1), "http://ex.org/desc", "linked data platform");
assert_eq!(idx.document_count(), 2);
let title_hits = idx.search("semantic");
assert!(!title_hits.is_empty());
let desc_hits = idx.search("linked");
assert!(!desc_hits.is_empty());
}
#[test]
fn test_search_results_sorted_descending() {
let mut idx = TextSearchIndex::new();
idx.index_triple(&subj(1), pred(), "sparql query language protocol");
idx.index_triple(
&subj(2),
pred(),
"sparql sparql query query endpoint server",
);
idx.index_triple(&subj(3), pred(), "sparql endpoint");
let hits = idx.search("sparql query");
for w in hits.windows(2) {
assert!(
w[0].score >= w[1].score,
"Results should be sorted descending by score"
);
}
}
#[test]
fn test_hit_fields_populated() {
let mut idx = TextSearchIndex::new();
idx.index_triple(
"http://ex.org/subject",
"http://ex.org/predicate",
"test literal value",
);
let hits = idx.search("literal");
assert_eq!(hits.len(), 1);
let hit = &hits[0];
assert_eq!(hit.subject, "http://ex.org/subject");
assert_eq!(hit.predicate, "http://ex.org/predicate");
assert_eq!(hit.literal, "test literal value");
assert!(hit.score > 0.0);
}
}