typstify_search/
schema.rs

1//! Search schema definition for Tantivy.
2//!
3//! Defines the search index schema with fields for title, body, URL, language, tags, and date.
4
5use tantivy::{
6    Index,
7    schema::{
8        DateOptions, FAST, Field, STORED, STRING, Schema, SchemaBuilder, TextFieldIndexing,
9        TextOptions,
10    },
11    tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer},
12};
13
14/// Search schema field references.
15#[derive(Debug, Clone)]
16pub struct SearchFields {
17    /// Page title (TEXT | STORED).
18    pub title: Field,
19
20    /// Page body content (TEXT).
21    pub body: Field,
22
23    /// Page URL (STRING | STORED).
24    pub url: Field,
25
26    /// Language code (STRING | STORED | FAST).
27    pub lang: Field,
28
29    /// Tags (TEXT | STORED).
30    pub tags: Field,
31
32    /// Publication date (DATE | STORED | FAST).
33    pub date: Field,
34}
35
36/// Create the search schema with all required fields.
37///
38/// Returns the schema and field references for indexing.
39pub fn create_search_schema() -> (Schema, SearchFields) {
40    let mut builder = SchemaBuilder::new();
41
42    // Title field: full-text searchable and stored for display
43    let title_options = TextOptions::default()
44        .set_indexing_options(
45            TextFieldIndexing::default()
46                .set_tokenizer("default")
47                .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions),
48        )
49        .set_stored();
50    let title = builder.add_text_field("title", title_options);
51
52    // Body field: full-text searchable, not stored (too large)
53    let body_options = TextOptions::default().set_indexing_options(
54        TextFieldIndexing::default()
55            .set_tokenizer("default")
56            .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions),
57    );
58    let body = builder.add_text_field("body", body_options);
59
60    // URL field: exact match, stored for results
61    let url = builder.add_text_field("url", STRING | STORED);
62
63    // Language field: exact match, stored, fast for filtering
64    let lang = builder.add_text_field("lang", STRING | STORED | FAST);
65
66    // Tags field: searchable and stored
67    let tags_options = TextOptions::default()
68        .set_indexing_options(
69            TextFieldIndexing::default()
70                .set_tokenizer("default")
71                .set_index_option(tantivy::schema::IndexRecordOption::WithFreqs),
72        )
73        .set_stored();
74    let tags = builder.add_text_field("tags", tags_options);
75
76    // Date field: stored and fast for sorting/filtering
77    let date_options = DateOptions::default().set_stored().set_fast();
78    let date = builder.add_date_field("date", date_options);
79
80    let schema = builder.build();
81    let fields = SearchFields {
82        title,
83        body,
84        url,
85        lang,
86        tags,
87        date,
88    };
89
90    (schema, fields)
91}
92
93/// Register custom tokenizers for the search index.
94///
95/// Sets up the default tokenizer with lowercase normalization.
96pub fn register_tokenizers(index: &Index) {
97    let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
98        .filter(LowerCaser)
99        .build();
100
101    index.tokenizers().register("default", tokenizer);
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Every expected field name resolves, and each handle in `SearchFields`
    /// refers to the schema field of the same name.
    #[test]
    fn test_create_schema() {
        let (schema, fields) = create_search_schema();

        let expected = [
            ("title", fields.title),
            ("body", fields.body),
            ("url", fields.url),
            ("lang", fields.lang),
            ("tags", fields.tags),
            ("date", fields.date),
        ];

        for (name, handle) in expected {
            let resolved = schema.get_field(name);
            assert!(resolved.is_ok(), "schema is missing field `{name}`");
            assert_eq!(handle, resolved.unwrap(), "handle mismatch for `{name}`");
        }
    }

    /// The title must be both searchable and retrievable for display.
    #[test]
    fn test_title_field_is_stored() {
        let (schema, fields) = create_search_schema();
        let field_entry = schema.get_field_entry(fields.title);

        assert_eq!(field_entry.name(), "title");
        assert!(field_entry.is_indexed());
        assert!(field_entry.is_stored());
    }

    /// The body is searchable but deliberately kept out of the doc store.
    #[test]
    fn test_body_field_is_not_stored() {
        let (schema, fields) = create_search_schema();
        let field_entry = schema.get_field_entry(fields.body);

        assert_eq!(field_entry.name(), "body");
        assert!(field_entry.is_indexed());
        assert!(!field_entry.is_stored());
    }

    /// The URL is an exact-match (`STRING`) field: indexed with the `raw`
    /// tokenizer rather than split into words, and stored for results.
    #[test]
    fn test_url_field_is_string() {
        let (schema, fields) = create_search_schema();
        let field_entry = schema.get_field_entry(fields.url);

        assert_eq!(field_entry.name(), "url");
        assert!(field_entry.is_indexed());
        assert!(field_entry.is_stored());

        match field_entry.field_type() {
            tantivy::schema::FieldType::Str(options) => {
                let tokenizer = options
                    .get_indexing_options()
                    .map(|indexing| indexing.tokenizer());
                assert_eq!(tokenizer, Some("raw"));
            }
            other => panic!("url should be a text field, got {other:?}"),
        }
    }

    /// `lang` and `date` are the fields intended for filtering/sorting, so
    /// both must be stored and fast.
    #[test]
    fn test_lang_and_date_are_fast() {
        let (schema, fields) = create_search_schema();

        for field in [fields.lang, fields.date] {
            let field_entry = schema.get_field_entry(field);
            assert!(field_entry.is_stored(), "`{}` not stored", field_entry.name());
            assert!(field_entry.is_fast(), "`{}` not fast", field_entry.name());
        }
    }

    /// After registration, the `"default"` analyzer splits on word boundaries
    /// and lowercases each token.
    #[test]
    fn test_register_tokenizers() {
        use tantivy::tokenizer::TokenStream;

        let (schema, _) = create_search_schema();
        let index = Index::create_in_ram(schema);

        register_tokenizers(&index);

        let mut analyzer = index
            .tokenizers()
            .get("default")
            .expect("`default` tokenizer should be registered");

        let mut stream = analyzer.token_stream("Hello WORLD");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }

        assert_eq!(tokens, ["hello", "world"]);
    }
}
157}