homoglyph_driver/
tantivy.rs

1use std::fs::{create_dir_all, read_to_string, File};
2use std::io::Write;
3use std::path::PathBuf;
4use std::str::FromStr;
5
6use homoglyph_core::confusable;
7use homoglyph_core::domain::{SentenceDomain, WordDomain};
8use homoglyph_core::glyph::EncodedGlyph;
9use homoglyph_core::sentence::EncodedSentence;
10use homoglyph_core::word::EncodedWord;
11
12use tantivy::collector::TopDocs;
13use tantivy::directory::{MmapDirectory, RamDirectory};
14use tantivy::query::{Query, QueryParser};
15use tantivy::schema::{STORED, TEXT};
16use tantivy::{doc, IndexSettings, ReloadPolicy};
17use tantivy::{schema::Schema, Index};
18
19use crate::SearchEngine;
20
/// Search engine state backed by a tantivy index.
#[allow(dead_code)]
pub struct Tantivy {
    /// Index handle; `init` obtains it from `create_managed_index`.
    index: Index,
    /// Schema the index was built with; `init` obtains it from `create_schema`.
    schema: Schema,
    /// Parsed queries filled in by `query` and read by `search`: one inner
    /// vector per item yielded by the encoded sentence's iterator.
    queries_by_domain: Vec<Vec<Box<dyn Query>>>,
}
27
/// Directory path returned by `build_mmap_path`, where the memory-mapped
/// index is opened or created.
static HOMOGLYPHS_DIR: &str = "/tmp/homoglyphs";
/// Marker file read by `new`: its content is parsed as an integer and `0`
/// (or a missing file) triggers indexing.
static HOMOGLYPHS_STATE_FILE: &str = "/tmp/homoglyphs/.state";
30
31fn create_schema() -> Schema {
32    let mut schema_builder = Schema::builder();
33    schema_builder.add_text_field("glyph", TEXT | STORED);
34    schema_builder.build()
35}
36
37fn build_mmap_path() -> PathBuf {
38    let mut path = PathBuf::new();
39    path.push(HOMOGLYPHS_DIR);
40    path
41}
42
43fn create_managed_index(schema: &Schema) -> Index {
44    let homoglyphs_dir = build_mmap_path();
45    if !homoglyphs_dir.exists() {
46        create_dir_all(&homoglyphs_dir).unwrap();
47    }
48
49    let mmap = MmapDirectory::open(homoglyphs_dir).unwrap();
50    let index = Index::open_or_create(mmap.clone(), schema.to_owned()).unwrap();
51    index
52}
53
54#[allow(dead_code)]
55fn create_in_ram_index(ram_directory: RamDirectory, schema: &Schema) -> Index {
56    Index::create(
57        ram_directory.to_owned(),
58        schema.to_owned(),
59        IndexSettings::default(),
60    )
61    .unwrap()
62}
63
// TODO: Implement a Config pattern and a State pattern
// TODO: Add inventory tracking and hash checking of the live file in `new`
// TODO: Add garbage collection
67#[allow(dead_code)]
68impl SearchEngine for Tantivy {
69    fn init() -> Self {
70        let schema = create_schema();
71
72        let index = create_managed_index(&schema);
73
74        Self {
75            index,
76            schema,
77            queries_by_domain: Vec::<Vec<Box<dyn Query>>>::new(),
78        }
79    }
80    fn new() -> Self {
81        let mut tantivy = Self::init();
82
83        let contents = read_to_string(HOMOGLYPHS_STATE_FILE);
84
85        let indexed_state = match contents {
86            Ok(p) => p,                // indexed
87            Err(_) => "0".to_string(), // not indexed
88        };
89
90        if i32::from_str(indexed_state.as_str()).unwrap() == 0 {
91            tantivy.index();
92        }
93        tantivy
94    }
95
96    fn index(&mut self) {
97        let confusable = confusable::confusable::HEX_FILE;
98        let glyph = self.schema.get_field("glyph").unwrap();
99
100        let mut index_writer = self.index.writer(50_000_000).unwrap();
101
102        for line in confusable.lines() {
103            index_writer.add_document(doc!(glyph => line)).unwrap();
104        }
105
106        index_writer.commit().unwrap();
107
108        let mut file = File::create("/tmp/homoglyphs/.state").unwrap();
109        file.write_all(b"1").unwrap();
110    }
111
112    fn query(&mut self, mut sentence_enc: EncodedSentence) {
113        let glyph = self.schema.get_field("glyph").unwrap();
114        let query_parser = QueryParser::for_index(&self.index, vec![glyph]);
115
116        for c in sentence_enc.iter() {
117            let mut queries = Vec::new();
118            for cc in c.iter() {
119                let query = query_parser.parse_query(cc.0.as_str()).unwrap();
120                queries.push(query);
121            }
122            self.queries_by_domain.push(queries)
123        }
124    }
125
126    fn search(&mut self) -> SentenceDomain {
127        let reader = self
128            .index
129            .reader_builder()
130            .reload_policy(ReloadPolicy::OnCommit)
131            .try_into()
132            .unwrap();
133        let searcher = reader.searcher();
134
135        let mut sentence_domain: Vec<WordDomain> = Vec::new();
136
137        // vec of queries for each word
138        for queries in &self.queries_by_domain {
139            // queries of confusable in each word
140            let mut world_domain: Vec<EncodedWord> = Vec::new();
141
142            for query in queries {
143                let mut confusable_word: Vec<EncodedWord> = Vec::new();
144                let top_docs = searcher.search(query, &TopDocs::with_limit(1)).unwrap();
145                let glyph = self.schema.get_field("glyph").unwrap();
146
147                // Found confusable for each query of each word
148                for (_score, doc_address) in top_docs {
149                    let retrieved_doc = searcher.doc(doc_address).unwrap();
150                    let value = retrieved_doc.get_all(glyph).into_iter().next().unwrap();
151                    let slice = value.as_text().unwrap().split_terminator(",");
152
153                    let mut domain_words_enc: Vec<EncodedGlyph> = Vec::new();
154
155                    // Encoded each confusable found for each word
156                    for s in slice {
157                        let confusable_glyph_enc = EncodedGlyph::from_str(s.trim()).unwrap();
158                        domain_words_enc.push(confusable_glyph_enc);
159                    }
160                    let encode_word = EncodedWord::new(domain_words_enc);
161                    confusable_word.push(encode_word);
162                }
163                world_domain.append(&mut confusable_word);
164            }
165            sentence_domain.push(WordDomain::new(world_domain));
166        }
167        SentenceDomain::new(sentence_domain)
168    }
169}
170
#[cfg(test)]
mod tests {
    use super::*;
    use homoglyph_core::sentence::{EncodedSentence, Sentence};

    /// Smoke test of the whole pipeline: initialise the engine, index the
    /// confusables, query an encoded sentence and print the resulting domain.
    /// It makes no assertions; it only checks that nothing panics.
    #[test]
    fn when_init_then_create_resource() {
        let mut engine = Tantivy::init();

        let plain = Sentence::from_str("ru best").unwrap();
        let encoded = EncodedSentence::from(plain);

        engine.index();
        engine.query(encoded);

        let domain = engine.search();
        println!("{:#?}", domain);
    }
}