// oxihuman_core/search_index.rs
// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0
#![allow(dead_code)]

//! Inverted-index based text search over string-keyed documents.

use std::collections::{HashMap, HashSet};

/// A document stored in the index.
///
/// The `id` is assigned by [`SearchIndex::insert`] and is unique within a
/// single index (ids are never reused, even after removal).
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct SearchDoc {
    pub id: u64,
    pub title: String,
    pub body: String,
}

/// Simple inverted index that maps lowercase tokens to document ids.
///
/// Invariant: every posting list in `index` contains a given document id at
/// most once, in ascending order, and no posting list is empty.
#[allow(dead_code)]
pub struct SearchIndex {
    /// All stored documents, keyed by id.
    docs: HashMap<u64, SearchDoc>,
    /// Lowercase token -> sorted, deduplicated ids of documents containing it.
    index: HashMap<String, Vec<u64>>,
    /// Next id to assign; monotonically increasing, never reset by `clear`.
    next_id: u64,
    /// Total token occurrences (duplicates included) across stored documents.
    total_indexed: usize,
}

#[allow(dead_code)]
impl SearchIndex {
    /// Create an empty index.
    pub fn new() -> Self {
        Self {
            docs: HashMap::new(),
            index: HashMap::new(),
            next_id: 0,
            total_indexed: 0,
        }
    }

    /// Tokenise text into lowercase alphanumeric words; everything that is
    /// not alphanumeric acts as a separator, and empty fragments are dropped.
    fn tokenize(text: &str) -> Vec<String> {
        text.split(|c: char| !c.is_alphanumeric())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_lowercase())
            .collect()
    }

    /// Add a document; returns its assigned id.
    pub fn insert(&mut self, title: &str, body: &str) -> u64 {
        let id = self.next_id;
        self.next_id += 1;
        let mut tokens = Self::tokenize(&format!("{} {}", title, body));
        // Count every occurrence (duplicates included) before deduplication.
        self.total_indexed += tokens.len();
        // Deduplicate so each posting list holds this doc id at most once;
        // previously a token repeated within one document pushed the same id
        // into its posting list multiple times.
        tokens.sort_unstable();
        tokens.dedup();
        for tok in tokens {
            self.index.entry(tok).or_default().push(id);
        }
        self.docs.insert(
            id,
            SearchDoc {
                id,
                title: title.to_string(),
                body: body.to_string(),
            },
        );
        id
    }

    /// Remove a document by id. Returns `false` if the id was unknown.
    pub fn remove(&mut self, id: u64) -> bool {
        let doc = match self.docs.remove(&id) {
            Some(doc) => doc,
            None => return false,
        };
        // Keep total_indexed consistent with the documents still stored
        // (previously it was never decremented on removal, so it went stale).
        let occurrences = Self::tokenize(&format!("{} {}", doc.title, doc.body)).len();
        self.total_indexed = self.total_indexed.saturating_sub(occurrences);
        // Strip the id from every posting list and prune now-empty lists so
        // token_count() keeps reporting only tokens present in some document.
        self.index.retain(|_, ids| {
            ids.retain(|&i| i != id);
            !ids.is_empty()
        });
        true
    }

    /// Search for documents containing ALL query tokens (case-insensitive).
    /// Returns matching doc ids in ascending order; an empty query matches
    /// nothing.
    pub fn search(&self, query: &str) -> Vec<u64> {
        let tokens = Self::tokenize(query);
        if tokens.is_empty() {
            return Vec::new();
        }
        let mut result: Option<HashSet<u64>> = None;
        for tok in &tokens {
            // A token absent from the index empties the AND-intersection.
            let ids: HashSet<u64> = match self.index.get(tok) {
                Some(postings) => postings.iter().copied().collect(),
                None => return Vec::new(),
            };
            result = Some(match result {
                None => ids,
                // Hash-set intersection replaces the previous per-element
                // Vec::contains scan, which was O(|prev| * |ids|).
                Some(prev) => prev.intersection(&ids).copied().collect(),
            });
            // Intersection can only shrink; bail out as soon as it is empty.
            if matches!(&result, Some(set) if set.is_empty()) {
                return Vec::new();
            }
        }
        let mut out: Vec<u64> = result.unwrap_or_default().into_iter().collect();
        out.sort_unstable();
        out
    }

    /// Retrieve a document by id.
    pub fn get(&self, id: u64) -> Option<&SearchDoc> {
        self.docs.get(&id)
    }

    /// Number of documents in the index.
    pub fn doc_count(&self) -> usize {
        self.docs.len()
    }

    /// Number of unique tokens present in at least one stored document.
    pub fn token_count(&self) -> usize {
        self.index.len()
    }

    /// Total tokens indexed across stored documents (duplicates included).
    pub fn total_indexed(&self) -> usize {
        self.total_indexed
    }

    /// True when no documents are stored.
    pub fn is_empty(&self) -> bool {
        self.docs.is_empty()
    }

    /// Drop all documents and tokens. `next_id` is deliberately NOT reset so
    /// ids are never reused across the life of the index.
    pub fn clear(&mut self) {
        self.docs.clear();
        self.index.clear();
        self.total_indexed = 0;
    }
}

impl Default for SearchIndex {
    fn default() -> Self {
        Self::new()
    }
}
135
136pub fn new_search_index() -> SearchIndex {
137    SearchIndex::new()
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn insert_and_search() {
        let mut index = new_search_index();
        index.insert("hello world", "foo bar");
        assert_eq!(index.search("hello").len(), 1);
    }

    #[test]
    fn multi_token_and() {
        let mut index = new_search_index();
        index.insert("alpha beta", "body");
        index.insert("alpha only", "stuff");
        // Both query tokens must match, so only the first document qualifies.
        assert_eq!(index.search("alpha beta").len(), 1);
    }

    #[test]
    fn no_match_returns_empty() {
        let mut index = new_search_index();
        index.insert("rust programming", "systems");
        assert!(index.search("java").is_empty());
    }

    #[test]
    fn case_insensitive() {
        let mut index = new_search_index();
        index.insert("Hello World", "body");
        assert_eq!(index.search("HELLO").len(), 1);
    }

    #[test]
    fn remove_document() {
        let mut index = new_search_index();
        let doc_id = index.insert("test doc", "");
        assert!(index.remove(doc_id));
        assert!(index.search("test").is_empty());
    }

    #[test]
    fn doc_count() {
        let mut index = new_search_index();
        index.insert("a", "");
        index.insert("b", "");
        assert_eq!(index.doc_count(), 2);
    }

    #[test]
    fn get_doc() {
        let mut index = new_search_index();
        let doc_id = index.insert("title", "content");
        assert_eq!(index.get(doc_id).expect("should succeed").title, "title");
    }

    #[test]
    fn empty_query_returns_empty() {
        let mut index = new_search_index();
        index.insert("something", "");
        assert!(index.search("").is_empty());
    }

    #[test]
    fn clear_index() {
        let mut index = new_search_index();
        index.insert("doc", "text");
        index.clear();
        assert!(index.is_empty());
    }

    #[test]
    fn token_count_nonzero_after_insert() {
        let mut index = new_search_index();
        index.insert("unique token here", "");
        assert!(index.token_count() > 0);
    }
}