Skip to main content

lb_rs/subscribers/
search.rs

1use crate::model::errors::{LbResult, Unexpected};
2use crate::model::file::File;
3use crate::service::activity::RankingWeights;
4use crate::service::events::Event;
5use crate::{LocalLb, tokio_spawn};
6use serde::{Deserialize, Serialize};
7use std::ops::Range;
8use std::sync::Arc;
9use std::sync::atomic::AtomicBool;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{INDEXED, STORED, Schema, TEXT, Value};
13use tantivy::snippet::SnippetGenerator;
14use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term, doc};
15use tokio::sync::RwLock;
16use uuid::Uuid;
17
/// Size cap, in bytes, for a document to be added to the content index (128 KiB). Documents
/// longer than this are skipped by `update_tantivy`.
const CONTENT_MAX_LEN_BYTES: usize = 1024 * 128;
19
20#[derive(Clone)]
21pub struct SearchIndex {
22    pub ready: Arc<AtomicBool>,
23
24    pub metadata_index: Arc<RwLock<SearchMetadata>>,
25    pub tantivy_index: Index,
26    pub tantivy_reader: IndexReader,
27}
28
29#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
30pub enum SearchConfig {
31    Paths,
32    Documents,
33    PathsAndDocuments,
34}
35
36#[derive(Debug, Serialize, Deserialize)]
37pub enum SearchResult {
38    DocumentMatch { id: Uuid, path: String, content_matches: Vec<ContentMatch> },
39    PathMatch { id: Uuid, path: String, matched_indices: Vec<usize>, score: i64 },
40}
41
42impl LocalLb {
43    /// Lockbook's search implementation.
44    ///
45    /// Takes an input and a configuration. The configuration describes whether we are searching
46    /// paths, documents or both.
47    ///
48    /// Document searches are handled by [tantivy](https://github.com/quickwit-oss/tantivy), and as
49    /// such support [tantivy's advanced query
50    /// syntax](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
51    /// In the future we plan to ingest a bunch of metadata and expose a full advanced search mode.
52    ///
53    /// Path searches are implemented as a subsequence filter with a number of hueristics to sort
54    /// the results. Preference is given to shorter paths, filename matches, suggested docs, and
55    /// documents that are editable in platform.
56    ///
57    /// Additionally if a path search contains a string, greater than 8 characters long that is
58    /// contained within any of the paths in the search index, that result is returned with the
59    /// highest score. lb:// style ids are also supported.
60    #[instrument(level = "debug", skip(self, input), err(Debug))]
61    pub async fn search(&self, input: &str, cfg: SearchConfig) -> LbResult<Vec<SearchResult>> {
62        // show suggested docs if the input string is empty
63        if input.is_empty() {
64            return self.search.metadata_index.read().await.empty_search();
65        }
66
67        match cfg {
68            SearchConfig::Paths => {
69                let mut results = self.search.metadata_index.read().await.path_search(input)?;
70                results.truncate(5);
71                Ok(results)
72            }
73            SearchConfig::Documents => {
74                let mut results = self.search_content(input).await?;
75                results.truncate(10);
76                Ok(results)
77            }
78            SearchConfig::PathsAndDocuments => {
79                let mut results = self.search.metadata_index.read().await.path_search(input)?;
80                results.truncate(4);
81                results.append(&mut self.search_content(input).await?);
82                Ok(results)
83            }
84        }
85    }
86
87    async fn search_content(&self, input: &str) -> LbResult<Vec<SearchResult>> {
88        let searcher = self.search.tantivy_reader.searcher();
89        let schema = self.search.tantivy_index.schema();
90        let id_field = schema.get_field("id").unwrap();
91        let content = schema.get_field("content").unwrap();
92
93        let query_parser = QueryParser::for_index(&self.search.tantivy_index, vec![content]);
94        let mut results = vec![];
95
96        if let Ok(query) = query_parser.parse_query(input) {
97            let mut snippet_generator =
98                SnippetGenerator::create(&searcher, &query, content).map_unexpected()?;
99            snippet_generator.set_max_num_chars(100);
100
101            let top_docs = searcher
102                .search(&query, &TopDocs::with_limit(10))
103                .map_unexpected()?;
104
105            for (_score, doc_address) in top_docs {
106                let retrieved_doc: TantivyDocument = searcher.doc(doc_address).map_unexpected()?;
107                let id = Uuid::from_slice(
108                    retrieved_doc
109                        .get_first(id_field)
110                        .map(|val| val.as_bytes().unwrap_or_default())
111                        .unwrap_or_default(),
112                )
113                .map_unexpected()?;
114
115                let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
116                let path = self
117                    .search
118                    .metadata_index
119                    .read()
120                    .await
121                    .paths
122                    .iter()
123                    .find(|(path_id, _)| *path_id == id)
124                    .map(|(_, path)| path.to_string())
125                    .unwrap_or_default();
126
127                results.push(SearchResult::DocumentMatch {
128                    id,
129                    path,
130                    content_matches: vec![ContentMatch {
131                        paragraph: snippet.fragment().to_string(),
132                        matched_indices: Self::highlight_to_matches(snippet.highlighted()),
133                        score: 0,
134                    }],
135                });
136            }
137        }
138        Ok(results)
139    }
140
141    fn highlight_to_matches(ranges: &[Range<usize>]) -> Vec<usize> {
142        let mut matches = vec![];
143        for range in ranges {
144            for i in range.clone() {
145                matches.push(i);
146            }
147        }
148
149        matches
150    }
151
152    pub fn reload_search_index(&self) -> LbResult<()> {
153        self.search
154            .tantivy_reader
155            .reload()
156            .map_err(|e| crate::LbErrKind::Unexpected(format!("tantivy reload: {e}")).into())
157    }
158
159    #[instrument(level = "debug", skip(self), err(Debug))]
160    pub async fn build_index(&self) -> LbResult<()> {
161        // if we haven't signed in yet, we'll leave our index entry and our event subscriber will
162        // handle the state change
163        if self.keychain.get_account().is_err() {
164            return Ok(());
165        }
166
167        let new_metadata = SearchMetadata::populate(self).await?;
168
169        let (deleted_ids, all_current_ids) = {
170            let mut current_metadata = self.search.metadata_index.write().await;
171            let deleted = new_metadata.compute_deleted(&current_metadata);
172            let current = new_metadata.files.iter().map(|f| f.id).collect::<Vec<_>>();
173            *current_metadata = new_metadata;
174            (deleted, current)
175        };
176
177        self.update_tantivy(deleted_ids, all_current_ids).await;
178
179        Ok(())
180    }
181
182    #[instrument(level = "debug", skip(self))]
183    pub fn setup_search(&self) {
184        if self.config.background_work {
185            let lb = self.clone();
186            let mut rx = self.subscribe();
187            tokio_spawn!(async move {
188                lb.build_index().await.unwrap();
189                loop {
190                    let evt = match rx.recv().await {
191                        Ok(evt) => evt,
192                        Err(err) => {
193                            error!("failed to receive from a channel {err}");
194                            return;
195                        }
196                    };
197
198                    match evt {
199                        Event::UserSignedIn => {
200                            lb.build_index().await.log_and_ignore();
201                        }
202                        Event::MetadataChanged(_) => {
203                            if let Some(replacement_index) =
204                                SearchMetadata::populate(&lb).await.log_and_ignore()
205                            {
206                                let current_index = lb.search.metadata_index.read().await.clone();
207                                let deleted_ids = replacement_index.compute_deleted(&current_index);
208                                *lb.search.metadata_index.write().await = replacement_index;
209                                lb.update_tantivy(deleted_ids, vec![]).await;
210                            }
211                        }
212                        Event::DocumentWritten(id, _) => {
213                            lb.update_tantivy(vec![id], vec![id]).await;
214                        }
215                        _ => {}
216                    };
217                }
218            });
219        }
220    }
221
222    async fn update_tantivy(&self, delete: Vec<Uuid>, add: Vec<Uuid>) {
223        let mut index_writer: IndexWriter = self.search.tantivy_index.writer(50_000_000).unwrap();
224        let schema = self.search.tantivy_index.schema();
225        let id_field = schema.get_field("id").unwrap();
226        let id_str = schema.get_field("id_str").unwrap();
227        let content = schema.get_field("content").unwrap();
228
229        for id in delete {
230            let term = Term::from_field_bytes(id_field, id.as_bytes());
231            index_writer.delete_term(term);
232        }
233
234        for id in add {
235            let id_bytes = id.as_bytes().as_slice();
236            let id_string = id.to_string();
237            let Some(file) = self
238                .search
239                .metadata_index
240                .read()
241                .await
242                .files
243                .iter()
244                .find(|f| f.id == id)
245                .cloned()
246            else {
247                continue;
248            };
249
250            if !file.name.ends_with(".md") || file.is_folder() {
251                continue;
252            };
253
254            let Ok(doc) = self.read_document(file.id, false).await else {
255                error!("failed to read doc");
256                continue;
257            };
258
259            if doc.len() > CONTENT_MAX_LEN_BYTES {
260                continue;
261            };
262
263            let Ok(doc) = String::from_utf8(doc) else {
264                continue;
265            };
266
267            index_writer
268                .add_document(doc!(
269                    id_field => id_bytes,
270                    id_str => id_string,
271                    content => doc,
272                ))
273                .unwrap();
274        }
275
276        index_writer.commit().unwrap();
277    }
278}
279
280impl Default for SearchIndex {
281    fn default() -> Self {
282        let mut schema_builder = Schema::builder();
283        schema_builder.add_bytes_field("id", INDEXED | STORED);
284        schema_builder.add_text_field("id_str", TEXT | STORED);
285        schema_builder.add_text_field("content", TEXT | STORED);
286
287        let schema = schema_builder.build();
288
289        let index = Index::create_in_ram(schema.clone());
290
291        // doing this here would be a bad idea if not for in-ram empty index
292        let reader = index
293            .reader_builder()
294            .reload_policy(ReloadPolicy::OnCommitWithDelay)
295            .try_into()
296            .unwrap();
297
298        Self {
299            ready: Default::default(),
300            tantivy_index: index,
301            tantivy_reader: reader,
302            metadata_index: Default::default(),
303        }
304    }
305}
306
307#[derive(Debug, Serialize, Deserialize)]
308pub struct ContentMatch {
309    pub paragraph: String,
310    pub matched_indices: Vec<usize>,
311    pub score: i64,
312}
313
314impl SearchResult {
315    pub fn id(&self) -> Uuid {
316        match self {
317            SearchResult::DocumentMatch { id, .. } | SearchResult::PathMatch { id, .. } => *id,
318        }
319    }
320
321    pub fn path(&self) -> &str {
322        match self {
323            SearchResult::DocumentMatch { path, .. } | SearchResult::PathMatch { path, .. } => path,
324        }
325    }
326
327    pub fn name(&self) -> &str {
328        match self {
329            SearchResult::DocumentMatch { path, .. } | SearchResult::PathMatch { path, .. } => {
330                path.split('/').next_back().unwrap_or_default()
331            }
332        }
333    }
334
335    pub fn score(&self) -> i64 {
336        match self {
337            SearchResult::DocumentMatch { content_matches, .. } => content_matches
338                .iter()
339                .map(|m| m.score)
340                .max()
341                .unwrap_or_default(),
342            SearchResult::PathMatch { score, .. } => *score,
343        }
344    }
345}
346
347#[derive(Default, Clone)]
348pub struct SearchMetadata {
349    files: Vec<File>,
350    paths: Vec<(Uuid, String)>,
351    suggested_docs: Vec<Uuid>,
352}
353
354impl SearchMetadata {
355    async fn populate(lb: &LocalLb) -> LbResult<Self> {
356        let files = lb.list_metadatas().await?;
357        let paths = lb.list_paths_with_ids(None).await?;
358        let suggested_docs = lb.suggested_docs(RankingWeights::default()).await?;
359
360        Ok(SearchMetadata { files, paths, suggested_docs })
361    }
362
363    fn compute_deleted(&self, old: &SearchMetadata) -> Vec<Uuid> {
364        let mut deleted_ids = vec![];
365
366        for old_file in &old.files {
367            if !self.files.iter().any(|new_f| new_f.id == old_file.id) {
368                deleted_ids.push(old_file.id);
369            }
370        }
371
372        deleted_ids
373    }
374
375    fn empty_search(&self) -> LbResult<Vec<SearchResult>> {
376        let mut results = vec![];
377
378        for id in &self.suggested_docs {
379            let path = self
380                .paths
381                .iter()
382                .find(|(path_id, _)| id == path_id)
383                .map(|(_, path)| path.clone())
384                .unwrap_or_default();
385
386            results.push(SearchResult::PathMatch {
387                id: *id,
388                path,
389                matched_indices: vec![],
390                score: 0,
391            });
392        }
393
394        Ok(results)
395    }
396
397    fn path_search(&self, query: &str) -> LbResult<Vec<SearchResult>> {
398        let mut results = self.path_candidates(query)?;
399        self.score_paths(&mut results);
400
401        results.sort_by_key(|r| -r.score());
402
403        if let Some(result) = self.id_match(query) {
404            results.insert(0, result);
405        }
406
407        Ok(results)
408    }
409
410    fn id_match(&self, query: &str) -> Option<SearchResult> {
411        if query.len() < 8 {
412            return None;
413        }
414
415        let query = if query.starts_with("lb://") {
416            query.replacen("lb://", "", 1)
417        } else {
418            query.to_string()
419        };
420
421        for (id, path) in &self.paths {
422            if id.to_string().contains(&query) {
423                return Some(SearchResult::PathMatch {
424                    id: *id,
425                    path: path.clone(),
426                    matched_indices: vec![],
427                    score: 100,
428                });
429            }
430        }
431
432        None
433    }
434
435    fn path_candidates(&self, query: &str) -> LbResult<Vec<SearchResult>> {
436        let mut search_results = vec![];
437
438        for (id, path) in &self.paths {
439            let mut matched_indices = vec![];
440
441            let mut query_iter = query.chars().rev();
442            let mut current_query_char = query_iter.next();
443
444            for (path_ind, path_char) in path.char_indices().rev() {
445                if let Some(qc) = current_query_char {
446                    if qc.eq_ignore_ascii_case(&path_char) {
447                        matched_indices.push(path_ind);
448                        current_query_char = query_iter.next();
449                    }
450                } else {
451                    break;
452                }
453            }
454
455            if current_query_char.is_none() {
456                search_results.push(SearchResult::PathMatch {
457                    id: *id,
458                    path: path.clone(),
459                    matched_indices,
460                    score: 0,
461                });
462            }
463        }
464        Ok(search_results)
465    }
466
467    fn score_paths(&self, candidates: &mut [SearchResult]) {
468        // tunable bonuses for path search
469        let smaller_paths = 10;
470        let suggested = 10;
471        let filename = 30;
472        let editable = 3;
473
474        candidates.sort_by_key(|a| a.path().len());
475
476        // the 10 smallest paths start with a mild advantage
477        for i in 0..smaller_paths {
478            if let Some(SearchResult::PathMatch { id: _, path: _, matched_indices: _, score }) =
479                candidates.get_mut(i)
480            {
481                *score = (smaller_paths - i) as i64;
482            }
483        }
484
485        // items in suggested docs have their score boosted
486        for cand in candidates.iter_mut() {
487            if self.suggested_docs.contains(&cand.id()) {
488                if let SearchResult::PathMatch { id: _, path: _, matched_indices: _, score } = cand
489                {
490                    *score += suggested;
491                }
492            }
493        }
494
495        // to what extent is the match in the name of the file
496        for cand in candidates.iter_mut() {
497            if let SearchResult::PathMatch { id: _, path, matched_indices, score } = cand {
498                let mut name_match = 0;
499                let mut name_size = 0;
500
501                for (i, c) in path.char_indices().rev() {
502                    if c == '/' {
503                        break;
504                    }
505                    name_size += 1;
506                    if matched_indices.contains(&i) {
507                        name_match += 1;
508                    }
509                }
510
511                let match_portion = name_match as f32 / name_size.max(1) as f32;
512                *score += (match_portion * filename as f32) as i64;
513            }
514        }
515
516        // if this document is editable in platform
517        for cand in candidates.iter_mut() {
518            if let SearchResult::PathMatch { id: _, path, matched_indices: _, score } = cand {
519                if path.ends_with(".md") || path.ends_with(".svg") {
520                    *score += editable;
521                }
522            }
523        }
524    }
525}