Skip to main content

ox_content_search/
index.rs

1//! Search index data structures.
2
3use std::collections::HashMap;
4
5use serde::{Deserialize, Serialize};
6
7use crate::tokenizer::tokenize;
8
9/// A searchable document in the index.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct SearchDocument {
12    /// Unique document identifier (usually the URL path).
13    pub id: String,
14    /// Document title.
15    pub title: String,
16    /// Document URL/path.
17    pub url: String,
18    /// Main content text.
19    pub body: String,
20    /// Headings in the document.
21    pub headings: Vec<String>,
22    /// Code snippets (optional).
23    #[serde(default)]
24    pub code: Vec<String>,
25}
26
27/// Posting list entry for inverted index.
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct Posting {
30    /// Document index in the documents array.
31    pub doc_idx: usize,
32    /// Term frequency in this document.
33    pub tf: u32,
34    /// Field where term was found (for boosting).
35    pub field: Field,
36}
37
38/// Document fields with different boost weights.
39#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
40pub enum Field {
41    /// Title field (highest weight).
42    Title,
43    /// Headings (high weight).
44    Heading,
45    /// Body text (normal weight).
46    Body,
47    /// Code blocks (lower weight).
48    Code,
49}
50
51impl Field {
52    /// Returns the boost factor for this field.
53    #[must_use]
54    pub fn boost(self) -> f64 {
55        match self {
56            Self::Title => 10.0,
57            Self::Heading => 5.0,
58            Self::Body => 1.0,
59            Self::Code => 0.5,
60        }
61    }
62}
63
64/// The main search index structure.
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct SearchIndex {
67    /// All indexed documents.
68    pub documents: Vec<SearchDocument>,
69    /// Inverted index: term -> list of postings.
70    pub index: HashMap<String, Vec<Posting>>,
71    /// Document frequency: term -> number of documents containing term.
72    pub df: HashMap<String, usize>,
73    /// Average document length (for BM25).
74    pub avg_dl: f64,
75    /// Total number of documents.
76    pub doc_count: usize,
77}
78
79impl SearchIndex {
80    /// Serializes the index to JSON.
81    #[must_use]
82    pub fn to_json(&self) -> String {
83        serde_json::to_string(self).unwrap_or_default()
84    }
85
86    /// Serializes the index to compact JSON.
87    #[must_use]
88    pub fn to_json_compact(&self) -> String {
89        serde_json::to_string(self).unwrap_or_default()
90    }
91
92    /// Deserializes an index from JSON.
93    pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
94        serde_json::from_str(json)
95    }
96
97    /// Returns the number of documents in the index.
98    #[must_use]
99    pub fn len(&self) -> usize {
100        self.documents.len()
101    }
102
103    /// Returns true if the index is empty.
104    #[must_use]
105    pub fn is_empty(&self) -> bool {
106        self.documents.is_empty()
107    }
108}
109
110/// Builder for constructing a search index.
111#[derive(Debug, Default)]
112pub struct SearchIndexBuilder {
113    documents: Vec<SearchDocument>,
114}
115
116impl SearchIndexBuilder {
117    /// Creates a new index builder.
118    #[must_use]
119    pub fn new() -> Self {
120        Self::default()
121    }
122
123    /// Adds a document to the index.
124    pub fn add_document(&mut self, doc: SearchDocument) -> &mut Self {
125        self.documents.push(doc);
126        self
127    }
128
129    /// Adds a simple document with just id, title, and body.
130    pub fn add_simple(&mut self, id: &str, title: &str, url: &str, body: &str) -> &mut Self {
131        self.documents.push(SearchDocument {
132            id: id.to_string(),
133            title: title.to_string(),
134            url: url.to_string(),
135            body: body.to_string(),
136            headings: Vec::new(),
137            code: Vec::new(),
138        });
139        self
140    }
141
142    /// Builds the search index.
143    #[must_use]
144    pub fn build(self) -> SearchIndex {
145        let mut index: HashMap<String, Vec<Posting>> = HashMap::new();
146        let mut df: HashMap<String, usize> = HashMap::new();
147        let mut total_length = 0usize;
148
149        for (doc_idx, doc) in self.documents.iter().enumerate() {
150            let mut doc_terms: HashMap<String, (u32, Field)> = HashMap::new();
151
152            // Index title
153            for token in tokenize(&doc.title) {
154                doc_terms
155                    .entry(token)
156                    .and_modify(|(count, _)| *count += 1)
157                    .or_insert((1, Field::Title));
158            }
159
160            // Index headings
161            for heading in &doc.headings {
162                for token in tokenize(heading) {
163                    doc_terms
164                        .entry(token)
165                        .and_modify(|(count, _)| *count += 1)
166                        .or_insert((1, Field::Heading));
167                }
168            }
169
170            // Index body
171            let body_tokens = tokenize(&doc.body);
172            total_length += body_tokens.len();
173            for token in body_tokens {
174                doc_terms
175                    .entry(token)
176                    .and_modify(|(count, _)| *count += 1)
177                    .or_insert((1, Field::Body));
178            }
179
180            // Index code
181            for code in &doc.code {
182                for token in tokenize(code) {
183                    doc_terms
184                        .entry(token)
185                        .and_modify(|(count, _)| *count += 1)
186                        .or_insert((1, Field::Code));
187                }
188            }
189
190            // Update document frequency and inverted index
191            for (term, (tf, field)) in doc_terms {
192                *df.entry(term.clone()).or_insert(0) += 1;
193                index.entry(term).or_default().push(Posting { doc_idx, tf, field });
194            }
195        }
196
197        let doc_count = self.documents.len();
198        #[allow(clippy::cast_precision_loss)]
199        let avg_dl = if doc_count > 0 { total_length as f64 / doc_count as f64 } else { 0.0 };
200
201        SearchIndex { documents: self.documents, index, df, avg_dl, doc_count }
202    }
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// Two simple documents produce an index that contains their title and body terms.
    #[test]
    fn test_build_index() {
        let mut builder = SearchIndexBuilder::new();
        builder
            .add_simple(
                "1",
                "Getting Started",
                "/getting-started",
                "Welcome to the documentation",
            )
            .add_simple("2", "Installation", "/installation", "How to install the package");

        let built = builder.build();
        let terms = &built.index;

        assert_eq!(built.len(), 2);
        assert!(terms.contains_key("getting"));
        assert!(terms.contains_key("started"));
        assert!(terms.contains_key("install"));
    }

    /// An index survives a round trip through its JSON representation.
    #[test]
    fn test_serialize_deserialize() {
        let mut builder = SearchIndexBuilder::new();
        builder.add_simple("1", "Test", "/test", "Test content");

        let json = builder.build().to_json();
        let restored = SearchIndex::from_json(&json).unwrap();

        assert_eq!(restored.len(), 1);
        assert_eq!(restored.documents[0].title, "Test");
    }
}