homoglyph_driver/
tantivy.rs1use std::fs::{create_dir_all, read_to_string, File};
2use std::io::Write;
3use std::path::PathBuf;
4use std::str::FromStr;
5
6use homoglyph_core::confusable;
7use homoglyph_core::domain::{SentenceDomain, WordDomain};
8use homoglyph_core::glyph::EncodedGlyph;
9use homoglyph_core::sentence::EncodedSentence;
10use homoglyph_core::word::EncodedWord;
11
12use tantivy::collector::TopDocs;
13use tantivy::directory::{MmapDirectory, RamDirectory};
14use tantivy::query::{Query, QueryParser};
15use tantivy::schema::{STORED, TEXT};
16use tantivy::{doc, IndexSettings, ReloadPolicy};
17use tantivy::{schema::Schema, Index};
18
19use crate::SearchEngine;
20
21#[allow(dead_code)]
22pub struct Tantivy {
23 index: Index,
24 schema: Schema,
25 queries_by_domain: Vec<Vec<Box<dyn Query>>>,
26}
27
/// Directory that holds the memory-mapped tantivy index.
static HOMOGLYPHS_DIR: &str = "/tmp/homoglyphs";
/// Marker file inside the index directory. `Tantivy::new` reads it to decide
/// whether the confusables table still has to be indexed, and `Tantivy::index`
/// writes `1` to this location once indexing has been committed.
static HOMOGLYPHS_STATE_FILE: &str = "/tmp/homoglyphs/.state";
30
31fn create_schema() -> Schema {
32 let mut schema_builder = Schema::builder();
33 schema_builder.add_text_field("glyph", TEXT | STORED);
34 schema_builder.build()
35}
36
37fn build_mmap_path() -> PathBuf {
38 let mut path = PathBuf::new();
39 path.push(HOMOGLYPHS_DIR);
40 path
41}
42
43fn create_managed_index(schema: &Schema) -> Index {
44 let homoglyphs_dir = build_mmap_path();
45 if !homoglyphs_dir.exists() {
46 create_dir_all(&homoglyphs_dir).unwrap();
47 }
48
49 let mmap = MmapDirectory::open(homoglyphs_dir).unwrap();
50 let index = Index::open_or_create(mmap.clone(), schema.to_owned()).unwrap();
51 index
52}
53
54#[allow(dead_code)]
55fn create_in_ram_index(ram_directory: RamDirectory, schema: &Schema) -> Index {
56 Index::create(
57 ram_directory.to_owned(),
58 schema.to_owned(),
59 IndexSettings::default(),
60 )
61 .unwrap()
62}
63
64#[allow(dead_code)]
68impl SearchEngine for Tantivy {
69 fn init() -> Self {
70 let schema = create_schema();
71
72 let index = create_managed_index(&schema);
73
74 Self {
75 index,
76 schema,
77 queries_by_domain: Vec::<Vec<Box<dyn Query>>>::new(),
78 }
79 }
80 fn new() -> Self {
81 let mut tantivy = Self::init();
82
83 let contents = read_to_string(HOMOGLYPHS_STATE_FILE);
84
85 let indexed_state = match contents {
86 Ok(p) => p, Err(_) => "0".to_string(), };
89
90 if i32::from_str(indexed_state.as_str()).unwrap() == 0 {
91 tantivy.index();
92 }
93 tantivy
94 }
95
96 fn index(&mut self) {
97 let confusable = confusable::confusable::HEX_FILE;
98 let glyph = self.schema.get_field("glyph").unwrap();
99
100 let mut index_writer = self.index.writer(50_000_000).unwrap();
101
102 for line in confusable.lines() {
103 index_writer.add_document(doc!(glyph => line)).unwrap();
104 }
105
106 index_writer.commit().unwrap();
107
108 let mut file = File::create("/tmp/homoglyphs/.state").unwrap();
109 file.write_all(b"1").unwrap();
110 }
111
112 fn query(&mut self, mut sentence_enc: EncodedSentence) {
113 let glyph = self.schema.get_field("glyph").unwrap();
114 let query_parser = QueryParser::for_index(&self.index, vec![glyph]);
115
116 for c in sentence_enc.iter() {
117 let mut queries = Vec::new();
118 for cc in c.iter() {
119 let query = query_parser.parse_query(cc.0.as_str()).unwrap();
120 queries.push(query);
121 }
122 self.queries_by_domain.push(queries)
123 }
124 }
125
126 fn search(&mut self) -> SentenceDomain {
127 let reader = self
128 .index
129 .reader_builder()
130 .reload_policy(ReloadPolicy::OnCommit)
131 .try_into()
132 .unwrap();
133 let searcher = reader.searcher();
134
135 let mut sentence_domain: Vec<WordDomain> = Vec::new();
136
137 for queries in &self.queries_by_domain {
139 let mut world_domain: Vec<EncodedWord> = Vec::new();
141
142 for query in queries {
143 let mut confusable_word: Vec<EncodedWord> = Vec::new();
144 let top_docs = searcher.search(query, &TopDocs::with_limit(1)).unwrap();
145 let glyph = self.schema.get_field("glyph").unwrap();
146
147 for (_score, doc_address) in top_docs {
149 let retrieved_doc = searcher.doc(doc_address).unwrap();
150 let value = retrieved_doc.get_all(glyph).into_iter().next().unwrap();
151 let slice = value.as_text().unwrap().split_terminator(",");
152
153 let mut domain_words_enc: Vec<EncodedGlyph> = Vec::new();
154
155 for s in slice {
157 let confusable_glyph_enc = EncodedGlyph::from_str(s.trim()).unwrap();
158 domain_words_enc.push(confusable_glyph_enc);
159 }
160 let encode_word = EncodedWord::new(domain_words_enc);
161 confusable_word.push(encode_word);
162 }
163 world_domain.append(&mut confusable_word);
164 }
165 sentence_domain.push(WordDomain::new(world_domain));
166 }
167 SentenceDomain::new(sentence_domain)
168 }
169}
170
#[cfg(test)]
mod tests {
    use super::*;
    use homoglyph_core::sentence::{EncodedSentence, Sentence};

    /// Smoke test for the whole pipeline: open the index, load the
    /// confusables table, query a short sentence and print the resulting
    /// domain. It makes no assertions beyond "nothing panics", and it writes
    /// to the on-disk index used by `Tantivy::init`.
    #[test]
    fn when_init_then_create_resource() {
        let mut engine = Tantivy::init();

        let decoded = Sentence::from_str("ru best").unwrap();
        let encoded: EncodedSentence = EncodedSentence::from(decoded);

        engine.index();
        engine.query(encoded);

        let domain: SentenceDomain = engine.search();
        println!("{:#?}", domain);
    }
}