book_searcher_core/
lib.rs

1use serde::{Deserialize, Serialize};
2use serde_with::{serde_as, DefaultOnError, DefaultOnNull};
3use std::path::Path;
4pub use tantivy::store::Compressor;
5use tantivy::{
6    query::QueryParser, schema::*, store::ZstdCompressor, tokenizer::TextAnalyzer, Index,
7    TantivyError,
8};
9use tantivy_meta_tokenizer::{get_tokenizer, META_TOKENIZER};
10
11pub mod index;
12pub mod search;
13
14#[serde_as]
15#[derive(Debug, Default, Serialize, Deserialize)]
16pub struct Book {
17    pub id: u64,
18
19    pub title: String,
20    #[serde_as(deserialize_as = "DefaultOnNull")]
21    pub author: String,
22    #[serde_as(deserialize_as = "DefaultOnNull")]
23    pub publisher: String,
24    #[serde_as(deserialize_as = "DefaultOnNull")]
25    pub extension: String,
26    #[serde_as(deserialize_as = "DefaultOnError")]
27    pub filesize: u64,
28    #[serde_as(deserialize_as = "DefaultOnNull")]
29    pub language: String,
30    #[serde_as(deserialize_as = "DefaultOnError")]
31    pub year: u64,
32    #[serde_as(deserialize_as = "DefaultOnError")]
33    pub pages: u64,
34    #[serde_as(deserialize_as = "DefaultOnNull")]
35    pub isbn: String,
36    #[serde_as(deserialize_as = "DefaultOnNull")]
37    pub ipfs_cid: String,
38}
39
40impl From<(&Schema, Document)> for Book {
41    fn from((schema, doc): (&Schema, Document)) -> Self {
42        macro_rules! get_field_text {
43            ($field:expr) => {
44                doc.get_first(schema.get_field($field).unwrap())
45                    .unwrap()
46                    .as_text()
47                    .unwrap_or_default()
48                    .to_owned()
49            };
50        }
51
52        macro_rules! get_field_u64 {
53            ($field:expr) => {
54                doc.get_first(schema.get_field($field).unwrap())
55                    .unwrap()
56                    .as_u64()
57                    .unwrap_or_default()
58            };
59        }
60
61        Book {
62            id: get_field_u64!("id"),
63            title: get_field_text!("title"),
64            author: get_field_text!("author"),
65            publisher: get_field_text!("publisher"),
66            extension: get_field_text!("extension"),
67            filesize: get_field_u64!("filesize"),
68            language: get_field_text!("language"),
69            year: get_field_u64!("year"),
70            pages: get_field_u64!("pages"),
71            isbn: get_field_text!("isbn"),
72            ipfs_cid: get_field_text!("ipfs_cid"),
73        }
74    }
75}
76
77#[derive(Clone)]
78pub struct Searcher {
79    pub compressor: Compressor,
80
81    index: Index,
82    schema: Schema,
83    query_parser: QueryParser,
84    tokenizer: TextAnalyzer,
85
86    // fields
87    id: Field,
88    title: Field,
89    author: Field,
90    publisher: Field,
91    publisher_exist: Field,
92    extension: Field,
93    filesize: Field,
94    language: Field,
95    year: Field,
96    pages: Field,
97    isbn: Field,
98    ipfs_cid: Field,
99}
100
101impl Searcher {
102    pub fn new(index_dir: impl AsRef<Path>) -> Self {
103        let text_indexing = TextFieldIndexing::default()
104            .set_tokenizer(META_TOKENIZER)
105            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
106        let text_options = TextOptions::default()
107            .set_indexing_options(text_indexing)
108            .set_stored();
109
110        let mut schema_builder = Schema::builder();
111        let id = schema_builder.add_u64_field("id", INDEXED | STORED);
112        let title = schema_builder.add_text_field("title", text_options.clone());
113        let author = schema_builder.add_text_field("author", text_options.clone());
114        let publisher = schema_builder.add_text_field("publisher", text_options);
115        // publisher_exist is for score tweaking
116        let publisher_exist = schema_builder.add_bool_field("publisher_exist", FAST);
117        let extension = schema_builder.add_text_field("extension", STRING | STORED);
118        let filesize = schema_builder.add_u64_field("filesize", STORED);
119        let language = schema_builder.add_text_field("language", TEXT | STORED);
120        let year = schema_builder.add_u64_field("year", STORED);
121        let pages = schema_builder.add_u64_field("pages", STORED | FAST);
122        let isbn = schema_builder.add_text_field("isbn", TEXT | STORED);
123        let ipfs_cid = schema_builder.add_text_field("ipfs_cid", STORED);
124        let schema = schema_builder.build();
125
126        // open or create index
127        let index_dir = index_dir.as_ref();
128        let mut index = Index::open_in_dir(index_dir).unwrap_or_else(|err| {
129            if let TantivyError::OpenDirectoryError(_) | TantivyError::OpenReadError(_) = err {
130                std::fs::create_dir_all(index_dir).expect("create index directory");
131                Index::create_in_dir(index_dir, schema.clone()).unwrap()
132            } else {
133                panic!("Error opening index: {err:?}")
134            }
135        });
136
137        let tokenizer = get_tokenizer();
138        index
139            .tokenizers()
140            .register(META_TOKENIZER, tokenizer.clone());
141        _ = index.set_default_multithread_executor();
142
143        let mut query_parser = QueryParser::for_index(&index, vec![title, author, publisher, isbn]);
144        query_parser.set_conjunction_by_default();
145
146        Self {
147            compressor: Compressor::Brotli,
148
149            index,
150            schema,
151            query_parser,
152            tokenizer,
153
154            id,
155            title,
156            author,
157            publisher,
158            publisher_exist,
159            extension,
160            filesize,
161            language,
162            year,
163            pages,
164            isbn,
165            ipfs_cid,
166        }
167    }
168
169    pub fn set_compressor(&mut self, compressor: &str) {
170        let compressor = match compressor {
171            "none" => Compressor::None,
172            "lz4" => Compressor::Lz4,
173            "brotli" => Compressor::Brotli,
174            "snappy" => Compressor::Snappy,
175            _ => {
176                if compressor.starts_with("zstd") {
177                    Compressor::Zstd(ZstdCompressor::default())
178                } else {
179                    println!(
180                        "compressor not valid: {:#?}",
181                        ["none", "lz4", "brotli", "snappy", "zstd",]
182                    );
183                    std::process::exit(1);
184                }
185            }
186        };
187
188        self.index.settings_mut().docstore_compression = compressor;
189    }
190}