lb_rs/subscribers/
search.rs

1use crate::model::errors::{LbResult, Unexpected};
2use crate::model::file::File;
3use crate::service::activity::RankingWeights;
4use crate::service::events::Event;
5use crate::Lb;
6use serde::Serialize;
7use std::ops::Range;
8use std::sync::atomic::AtomicBool;
9use std::sync::Arc;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, Value, STORED, TEXT};
13use tantivy::{
14    doc, Index, IndexReader, IndexWriter, ReloadPolicy, SnippetGenerator, TantivyDocument, Term,
15};
16use tokio::sync::RwLock;
17use uuid::Uuid;
18
/// Largest document, in bytes (128 KiB), that is added to the content index; anything bigger is
/// skipped when the index is updated.
const CONTENT_MAX_LEN_BYTES: usize = 128 * 1024;
20
21#[derive(Clone)]
22pub struct SearchIndex {
23    pub ready: Arc<AtomicBool>,
24
25    pub metadata_index: Arc<RwLock<SearchMetadata>>,
26    pub tantivy_index: Index,
27    pub tantivy_reader: IndexReader,
28}
29
/// Selects which parts of the index a call to `Lb::search` consults.
#[derive(Clone, Copy, Debug)]
pub enum SearchConfig {
    /// Match the query against file paths only.
    Paths,

    /// Match the query against document contents only.
    Documents,

    /// Return path matches first, followed by document content matches.
    PathsAndDocuments,
}
36
37#[derive(Debug)]
38pub enum SearchResult {
39    DocumentMatch { id: Uuid, path: String, content_matches: Vec<ContentMatch> },
40    PathMatch { id: Uuid, path: String, matched_indices: Vec<usize>, score: i64 },
41}
42
43impl Lb {
44    /// Lockbook's search implementation.
45    ///
46    /// Takes an input and a configuration. The configuration describes whether we are searching
47    /// paths, documents or both.
48    ///
49    /// Document searches are handled by [tantivy](https://github.com/quickwit-oss/tantivy), and as
50    /// such support [tantivy's advanced query
51    /// syntax](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
52    /// In the future we plan to ingest a bunch of metadata and expose a full advanced search mode.
53    ///
54    /// Path searches are implemented as a subsequence filter with a number of hueristics to sort
55    /// the results. Preference is given to shorter paths, filename matches, suggested docs, and
56    /// documents that are editable in platform.
57    ///
58    /// Additionally if a path search contains a string, greater than 8 characters long that is
59    /// contained within any of the paths in the search index, that result is returned with the
60    /// highest score. lb:// style ids are also supported.
61    #[instrument(level = "debug", skip(self), err(Debug))]
62    pub async fn search(&self, input: &str, cfg: SearchConfig) -> LbResult<Vec<SearchResult>> {
63        // show suggested docs if the input string is empty
64        if input.is_empty() {
65            return self.search.metadata_index.read().await.empty_search();
66        }
67
68        match cfg {
69            SearchConfig::Paths => {
70                let mut results = self.search.metadata_index.read().await.path_search(input)?;
71                results.truncate(5);
72                Ok(results)
73            }
74            SearchConfig::Documents => {
75                let mut results = self.search_content(input).await?;
76                results.truncate(10);
77                Ok(results)
78            }
79            SearchConfig::PathsAndDocuments => {
80                let mut results = self.search.metadata_index.read().await.path_search(input)?;
81                results.truncate(4);
82                results.append(&mut self.search_content(input).await?);
83                Ok(results)
84            }
85        }
86    }
87
88    async fn search_content(&self, input: &str) -> LbResult<Vec<SearchResult>> {
89        let searcher = self.search.tantivy_reader.searcher();
90        let schema = self.search.tantivy_index.schema();
91        let id_field = schema.get_field("id").unwrap();
92        let content = schema.get_field("content").unwrap();
93
94        let query_parser = QueryParser::for_index(&self.search.tantivy_index, vec![content]);
95        let mut results = vec![];
96
97        if let Ok(query) = query_parser.parse_query(input) {
98            let mut snippet_generator =
99                SnippetGenerator::create(&searcher, &query, content).map_unexpected()?;
100            snippet_generator.set_max_num_chars(100);
101
102            let top_docs = searcher
103                .search(&query, &TopDocs::with_limit(10))
104                .map_unexpected()?;
105
106            for (_score, doc_address) in top_docs {
107                let retrieved_doc: TantivyDocument = searcher.doc(doc_address).map_unexpected()?;
108                let id = Uuid::from_slice(
109                    retrieved_doc
110                        .get_first(id_field)
111                        .map(|val| val.as_bytes().unwrap_or_default())
112                        .unwrap_or_default(),
113                )
114                .map_unexpected()?;
115
116                let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
117                let path = self
118                    .search
119                    .metadata_index
120                    .read()
121                    .await
122                    .paths
123                    .iter()
124                    .find(|(path_id, _)| *path_id == id)
125                    .map(|(_, path)| path.to_string())
126                    .unwrap_or_default();
127
128                results.push(SearchResult::DocumentMatch {
129                    id,
130                    path,
131                    content_matches: vec![ContentMatch {
132                        paragraph: snippet.fragment().to_string(),
133                        matched_indices: Self::highlight_to_matches(snippet.highlighted()),
134                        score: 0,
135                    }],
136                });
137            }
138        }
139        Ok(results)
140    }
141
142    fn highlight_to_matches(ranges: &[Range<usize>]) -> Vec<usize> {
143        let mut matches = vec![];
144        for range in ranges {
145            for i in range.clone() {
146                matches.push(i);
147            }
148        }
149
150        matches
151    }
152
153    #[instrument(level = "debug", skip(self), err(Debug))]
154    pub async fn build_index(&self) -> LbResult<()> {
155        // if we haven't signed in yet, we'll leave our index entry and our event subscriber will
156        // handle the state change
157        if self.keychain.get_account().is_err() {
158            return Ok(());
159        }
160
161        let metadata_index = SearchMetadata::populate(self).await?;
162        *self.search.metadata_index.write().await = metadata_index.clone();
163        self.update_tantivy(vec![], metadata_index.files.iter().map(|f| f.id).collect())
164            .await;
165
166        Ok(())
167    }
168
169    #[instrument(level = "debug", skip(self))]
170    pub fn setup_search(&self) {
171        if self.config.background_work {
172            let lb = self.clone();
173            let mut rx = self.subscribe();
174            tokio::spawn(async move {
175                lb.build_index().await.unwrap();
176                loop {
177                    let evt = match rx.recv().await {
178                        Ok(evt) => evt,
179                        Err(err) => {
180                            error!("failed to receive from a channel {err}");
181                            return;
182                        }
183                    };
184
185                    match evt {
186                        Event::MetadataChanged => {
187                            if let Some(replacement_index) =
188                                SearchMetadata::populate(&lb).await.log_and_ignore()
189                            {
190                                let current_index = lb.search.metadata_index.read().await.clone();
191                                let deleted_ids = replacement_index.compute_deleted(&current_index);
192                                *lb.search.metadata_index.write().await = replacement_index;
193                                lb.update_tantivy(vec![], deleted_ids).await;
194                            }
195                        }
196                        Event::DocumentWritten(id, _) => {
197                            lb.update_tantivy(vec![id], vec![id]).await;
198                        }
199                        _ => {}
200                    };
201                }
202            });
203        }
204    }
205
206    async fn update_tantivy(&self, delete: Vec<Uuid>, add: Vec<Uuid>) {
207        let mut index_writer: IndexWriter = self.search.tantivy_index.writer(50_000_000).unwrap();
208        let schema = self.search.tantivy_index.schema();
209        let id_field = schema.get_field("id").unwrap();
210        let id_str = schema.get_field("id_str").unwrap();
211        let content = schema.get_field("content").unwrap();
212
213        for id in delete {
214            let term = Term::from_field_text(id_str, &id.to_string());
215            index_writer.delete_term(term);
216        }
217
218        for id in add {
219            let id_bytes = id.as_bytes().as_slice();
220            let id_string = id.to_string();
221            let Some(file) = self
222                .search
223                .metadata_index
224                .read()
225                .await
226                .files
227                .iter()
228                .find(|f| f.id == id)
229                .cloned()
230            else {
231                continue;
232            };
233
234            if !file.name.ends_with(".md") || file.is_folder() {
235                continue;
236            };
237
238            let doc = String::from_utf8(self.read_document(file.id, false).await.unwrap()).unwrap();
239
240            if doc.len() > CONTENT_MAX_LEN_BYTES {
241                continue;
242            };
243
244            index_writer
245                .add_document(doc!(
246                    id_field => id_bytes,
247                    id_str => id_string,
248                    content => doc,
249                ))
250                .unwrap();
251        }
252
253        index_writer.commit().unwrap();
254    }
255}
256
257impl Default for SearchIndex {
258    fn default() -> Self {
259        let mut schema_builder = Schema::builder();
260        schema_builder.add_bytes_field("id", STORED);
261        schema_builder.add_text_field("id_str", TEXT | STORED);
262        schema_builder.add_text_field("content", TEXT | STORED);
263
264        let schema = schema_builder.build();
265
266        let index = Index::create_in_ram(schema.clone());
267
268        // doing this here would be a bad idea if not for in-ram empty index
269        let reader = index
270            .reader_builder()
271            .reload_policy(ReloadPolicy::OnCommitWithDelay)
272            .try_into()
273            .unwrap();
274
275        Self {
276            ready: Default::default(),
277            tantivy_index: index,
278            tantivy_reader: reader,
279            metadata_index: Default::default(),
280        }
281    }
282}
283
284#[derive(Debug, Serialize)]
285pub struct ContentMatch {
286    pub paragraph: String,
287    pub matched_indices: Vec<usize>,
288    pub score: i64,
289}
290
291impl SearchResult {
292    pub fn id(&self) -> Uuid {
293        match self {
294            SearchResult::DocumentMatch { id, .. } | SearchResult::PathMatch { id, .. } => *id,
295        }
296    }
297
298    pub fn path(&self) -> &str {
299        match self {
300            SearchResult::DocumentMatch { path, .. } | SearchResult::PathMatch { path, .. } => path,
301        }
302    }
303
304    pub fn name(&self) -> &str {
305        match self {
306            SearchResult::DocumentMatch { path, .. } | SearchResult::PathMatch { path, .. } => {
307                path.split('/').next_back().unwrap_or_default()
308            }
309        }
310    }
311
312    pub fn score(&self) -> i64 {
313        match self {
314            SearchResult::DocumentMatch { content_matches, .. } => content_matches
315                .iter()
316                .map(|m| m.score)
317                .max()
318                .unwrap_or_default(),
319            SearchResult::PathMatch { score, .. } => *score,
320        }
321    }
322}
323
324#[derive(Default, Clone)]
325pub struct SearchMetadata {
326    files: Vec<File>,
327    paths: Vec<(Uuid, String)>,
328    suggested_docs: Vec<Uuid>,
329}
330
331impl SearchMetadata {
332    async fn populate(lb: &Lb) -> LbResult<Self> {
333        let files = lb.list_metadatas().await?;
334        let paths = lb.list_paths_with_ids(None).await?;
335        let suggested_docs = lb.suggested_docs(RankingWeights::default()).await?;
336
337        Ok(SearchMetadata { files, paths, suggested_docs })
338    }
339
340    fn compute_deleted(&self, old: &SearchMetadata) -> Vec<Uuid> {
341        let mut deleted_ids = vec![];
342
343        for old_file in &old.files {
344            if !self.files.iter().any(|new_f| new_f.id == old_file.id) {
345                deleted_ids.push(old_file.id);
346            }
347        }
348
349        deleted_ids
350    }
351
352    fn empty_search(&self) -> LbResult<Vec<SearchResult>> {
353        let mut results = vec![];
354
355        for id in &self.suggested_docs {
356            let path = self
357                .paths
358                .iter()
359                .find(|(path_id, _)| id == path_id)
360                .map(|(_, path)| path.clone())
361                .unwrap_or_default();
362
363            results.push(SearchResult::PathMatch {
364                id: *id,
365                path,
366                matched_indices: vec![],
367                score: 0,
368            });
369        }
370
371        Ok(results)
372    }
373
374    fn path_search(&self, query: &str) -> LbResult<Vec<SearchResult>> {
375        let mut results = self.path_candidates(query)?;
376        self.score_paths(&mut results);
377
378        results.sort_by_key(|r| -r.score());
379
380        if let Some(result) = self.id_match(query) {
381            results.insert(0, result);
382        }
383
384        Ok(results)
385    }
386
387    fn id_match(&self, query: &str) -> Option<SearchResult> {
388        if query.len() < 8 {
389            return None;
390        }
391
392        let query = if query.starts_with("lb://") {
393            query.replacen("lb://", "", 1)
394        } else {
395            query.to_string()
396        };
397
398        for (id, path) in &self.paths {
399            if id.to_string().contains(&query) {
400                return Some(SearchResult::PathMatch {
401                    id: *id,
402                    path: path.clone(),
403                    matched_indices: vec![],
404                    score: 100,
405                });
406            }
407        }
408
409        None
410    }
411
412    fn path_candidates(&self, query: &str) -> LbResult<Vec<SearchResult>> {
413        let mut search_results = vec![];
414
415        for (id, path) in &self.paths {
416            let mut matched_indices = vec![];
417
418            let mut query_iter = query.chars().rev();
419            let mut current_query_char = query_iter.next();
420
421            for (path_ind, path_char) in path.char_indices().rev() {
422                if let Some(qc) = current_query_char {
423                    if qc.eq_ignore_ascii_case(&path_char) {
424                        matched_indices.push(path_ind);
425                        current_query_char = query_iter.next();
426                    }
427                } else {
428                    break;
429                }
430            }
431
432            if current_query_char.is_none() {
433                search_results.push(SearchResult::PathMatch {
434                    id: *id,
435                    path: path.clone(),
436                    matched_indices,
437                    score: 0,
438                });
439            }
440        }
441        Ok(search_results)
442    }
443
444    fn score_paths(&self, candidates: &mut [SearchResult]) {
445        // tunable bonuses for path search
446        let smaller_paths = 10;
447        let suggested = 10;
448        let filename = 30;
449        let editable = 3;
450
451        candidates.sort_by_key(|a| a.path().len());
452
453        // the 10 smallest paths start with a mild advantage
454        for i in 0..smaller_paths {
455            if let Some(SearchResult::PathMatch { id: _, path: _, matched_indices: _, score }) =
456                candidates.get_mut(i)
457            {
458                *score = (smaller_paths - i) as i64;
459            }
460        }
461
462        // items in suggested docs have their score boosted
463        for cand in candidates.iter_mut() {
464            if self.suggested_docs.contains(&cand.id()) {
465                if let SearchResult::PathMatch { id: _, path: _, matched_indices: _, score } = cand
466                {
467                    *score += suggested;
468                }
469            }
470        }
471
472        // to what extent is the match in the name of the file
473        for cand in candidates.iter_mut() {
474            if let SearchResult::PathMatch { id: _, path, matched_indices, score } = cand {
475                let mut name_match = 0;
476                let mut name_size = 0;
477
478                for (i, c) in path.char_indices().rev() {
479                    if c == '/' {
480                        break;
481                    }
482                    name_size += 1;
483                    if matched_indices.contains(&i) {
484                        name_match += 1;
485                    }
486                }
487
488                let match_portion = name_match as f32 / name_size.max(1) as f32;
489                *score += (match_portion * filename as f32) as i64;
490            }
491        }
492
493        // if this document is editable in platform
494        for cand in candidates.iter_mut() {
495            if let SearchResult::PathMatch { id: _, path, matched_indices: _, score } = cand {
496                if path.ends_with(".md") || path.ends_with(".svg") {
497                    *score += editable;
498                }
499            }
500        }
501    }
502}