quickner/
quickner.rs

1use crate::{
2    config::{Config, Filters},
3    models::Text,
4    utils::{char_to_byte, get_progress_bar, is_valid_utf8},
5    SpacyEntity,
6};
7use aho_corasick::AhoCorasick;
8use log::{error, info, warn};
9use rayon::prelude::*;
10use std::{collections::HashMap, path::Path, sync::Arc};
11use std::{
12    collections::HashSet,
13    fs::File,
14    io::{BufRead, BufReader},
15};
16use std::{env, error::Error};
17
18use crate::document::Document;
19use crate::entity::Entity;
20
21/// Quickner is the main struct of the application
22/// It holds the configuration file and the path to the configuration file
23#[derive(Clone)]
24pub struct Quickner {
25    /// Path to the configuration file
26    /// Default: ./config.toml
27    pub config: Config,
28    pub config_file: Option<String>,
29    pub documents: Vec<Document>,
30    pub entities: Vec<Entity>,
31    pub documents_hash: HashMap<String, Document>,
32    pub documents_label_index: HashMap<String, Vec<String>>,
33    pub documents_entities_index: HashMap<String, Vec<String>>,
34}
35
36impl Default for Quickner {
37    fn default() -> Self {
38        Self {
39            config: Config::default(),
40            config_file: Some("./config.toml".to_string()),
41            documents: Vec::new(),
42            entities: Vec::new(),
43            documents_hash: HashMap::new(),
44            documents_label_index: HashMap::new(),
45            documents_entities_index: HashMap::new(),
46        }
47    }
48}
49
50impl Quickner {
51    /// Find the index of the entities in the text
52    /// # Arguments
53    /// * `text` - The text to search
54    /// * `entities` - The entities to search for
55    /// # Returns
56    /// * `Option<Vec<(usize, usize, String)>>` - The start and end index of the entity and the label
57    /// # Example
58    /// ```
59    /// use std::collections::HashSet;
60    /// use quickner::models::Entity;
61    ///
62    /// let text = "Rust is made by Mozilla".to_string();
63    /// let mut entities = HashSet::new();
64    /// entities.insert(Entity::new("Mozilla".to_string(), "ORG".to_string()));
65    /// let annotations = Annotations::find_index(text, entities);
66    /// assert_eq!(annotations, Some(vec![(15, 22, "ORG".to_string())]));
67    /// ```
68    pub(crate) fn find_index(
69        text: String,
70        entities: Vec<Entity>,
71    ) -> Option<Vec<(usize, usize, String)>> {
72        // let mut annotations = Vec::new();
73        let annotations = entities.iter().filter_map(|entity| {
74            let target_len = entity.name.len();
75            for (start, _) in text.match_indices(entity.name.as_str()) {
76                if start == 0
77                    || text
78                        .chars()
79                        .nth(start - 1)
80                        .unwrap_or_else(|| 'N')
81                        .is_whitespace()
82                    || text
83                        .chars()
84                        .nth(start - 1)
85                        .unwrap_or_else(|| 'N')
86                        .is_ascii_punctuation()
87                    || ((start + target_len) == text.len()
88                        || text
89                            .chars()
90                            .nth(start + target_len)
91                            .unwrap_or('N')
92                            .is_whitespace()
93                        || (text
94                            .chars()
95                            .nth(start + target_len)
96                            .unwrap_or('N')
97                            .is_ascii_punctuation()
98                            && text.chars().nth(start + target_len).unwrap() != '.'
99                            && (start > 0 && text.chars().nth(start - 1).unwrap() != '.')))
100                {
101                    return Some((start, start + target_len, entity.label.to_string()));
102                }
103            }
104            None
105        });
106        // Unique annotations
107        let mut annotations = annotations.collect::<Vec<(usize, usize, String)>>();
108        annotations.sort_by(|a, b| a.0.cmp(&b.0));
109        annotations.dedup();
110        // Sort annotations by start index
111        if !annotations.is_empty() {
112            Some(annotations)
113        } else {
114            None
115        }
116    }
117
118    pub(crate) fn find_index_using_aho_corasick(
119        text: &str,
120        aho_corasick: &Arc<AhoCorasick>,
121        entites: &Vec<Entity>,
122    ) -> Option<Vec<(usize, usize, String)>> {
123        if !is_valid_utf8(text) {
124            warn!("Skipping invalid utf8 text: \"{}\"", text);
125            return None;
126        }
127        let mut annotations = Vec::new();
128        for mat in aho_corasick.find_overlapping_iter(&text) {
129            let start = mat.start();
130            // convert byte index to char index (assuming utf8)
131            let start = text[..start].chars().count();
132            let end = mat.end();
133            let end = text[..end].chars().count();
134            let label = entites[mat.pattern()].label.to_string();
135            let name = entites[mat.pattern()].name.to_string();
136            let target_len = name.len();
137            if start == 0
138                && (text.chars().nth(end).unwrap_or('N').is_whitespace()
139                    || (text.chars().nth(end).unwrap_or('N').is_ascii_punctuation()))
140            {
141                annotations.push((start, end, label));
142                continue;
143            }
144            // if text == "monty python and the holy grail: the ultimate quiz http://bit.ly/pd3ms i got 42/50. can't believe i missed the name of lancelot's page " {
145            //     println!("Start: {}, End: {}, text_len: {}, End + 1: {}", start, end, text.len(), text.chars().nth(end + 1).unwrap_or('N'));
146            // }
147            // println!("Start: {}, End: {}, text_len: {}", start, end, char_len);
148            if start > 0
149                && text
150                    .chars()
151                    .nth(start - 1)
152                    .unwrap_or_else(|| 'N')
153                    .is_whitespace()
154                && (text.chars().nth(end).unwrap_or_else(|| 'N').is_whitespace()
155                    || text
156                        .chars()
157                        .nth(end)
158                        .unwrap_or_else(|| 'N')
159                        .is_ascii_punctuation())
160            {
161                annotations.push((start, end, label));
162                continue;
163            }
164            if start > 0
165                && text
166                    .chars()
167                    .nth(start - 1)
168                    .unwrap_or_else(|| 'N')
169                    .is_ascii_punctuation()
170                && (text.chars().nth(end).unwrap_or_else(|| 'N').is_whitespace()
171                    || text
172                        .chars()
173                        .nth(end)
174                        .unwrap_or_else(|| 'N')
175                        .is_ascii_punctuation())
176            {
177                annotations.push((start, end, label));
178                continue;
179            }
180            if (start + target_len) == text.len() {
181                annotations.push((start, end, label));
182                continue;
183            }
184            if (text
185                .chars()
186                .nth(start - 1)
187                .unwrap_or_else(|| 'N')
188                .is_ascii_punctuation()
189                || text
190                    .chars()
191                    .nth(start - 1)
192                    .unwrap_or_else(|| 'N')
193                    .is_whitespace())
194                && text
195                    .chars()
196                    .nth(start + target_len)
197                    .unwrap_or('N')
198                    .is_whitespace()
199            {
200                annotations.push((start, end, label));
201                continue;
202            }
203            if (text
204                .chars()
205                .nth(start - 1)
206                .unwrap_or_else(|| 'N')
207                .is_ascii_punctuation()
208                || text
209                    .chars()
210                    .nth(start - 1)
211                    .unwrap_or_else(|| 'N')
212                    .is_whitespace())
213                && text
214                    .chars()
215                    .nth(start + target_len)
216                    .unwrap_or('N')
217                    .is_ascii_punctuation()
218                && text.chars().nth(start + target_len).unwrap() != '.'
219                && (start > 0 && text.chars().nth(start - 1).unwrap() != '.')
220            {
221                annotations.push((start, end, label));
222            }
223        }
224        // Unique annotations
225        annotations.sort_by(|a, b| a.0.cmp(&b.0));
226        annotations.dedup();
227        // Sort annotations by start index
228        if !annotations.is_empty() {
229            Some(annotations)
230        } else {
231            None
232        }
233    }
234
235    /// Annotate the texts with the entities
236    /// # Example
237    /// ```
238    /// let mut annotations = Annotations::new(entities, texts);
239    /// annotations.annotate();
240    /// ```
241    /// # Panics
242    /// This function will panic if the texts are not loaded
243    /// # Performance
244    /// This function is parallelized using rayon
245    /// # Progress
246    /// This function will show a progress bar
247    /// # Arguments
248    /// * `self` - The annotations
249    /// # Returns
250    /// * `self` - The annotations with the annotations added
251    /// # Errors
252    /// This function will return an error if the texts are not loaded
253    pub fn annotate(&mut self) {
254        let pb = get_progress_bar(self.documents.len() as u64);
255        pb.set_message("Annotating texts");
256        let patterns = self
257            .entities
258            .iter()
259            .map(|entity| entity.name.as_str())
260            .collect::<Vec<&str>>();
261        // Check if apple is in the patterns
262        // if patterns.contains(&"apple") {
263        //     println!("Apple found in patterns");
264        // }
265        let aho_corasick = Arc::new(AhoCorasick::new(patterns));
266        self.documents.par_iter_mut().for_each(|document| {
267            let t: &mut String = &mut document.text;
268            if !self.config.texts.filters.case_sensitive {
269                *t = t.to_lowercase();
270            };
271            // ahocorasick implementation
272            let index = Quickner::find_index_using_aho_corasick(&t, &aho_corasick, &self.entities);
273            let mut index = match index {
274                Some(index) => index,
275                None => vec![],
276            };
277            index.sort_by(|a, b| a.0.cmp(&b.0));
278            document.label.extend(index);
279            pb.inc(1);
280        });
281        self.documents_hash = self
282            .documents
283            .iter()
284            .map(|document| (document.id.clone(), document.clone()))
285            .collect();
286        self.build_label_index();
287        self.build_entity_index();
288        pb.finish();
289    }
290
291    /// Creates a new instance of Quickner
292    /// If no configuration file is provided, the default configuration file is used.
293    /// Default: ./config.toml
294    /// # Arguments
295    /// * `config_file` - The path to the configuration file
296    /// # Example
297    /// ```
298    /// use quickner::Quickner;
299    /// let quickner = Quickner::new(Some("./config.toml"));
300    /// ```
301    /// # Panics
302    /// This function will panic if the configuration file does not exist
303    /// # Returns
304    /// * `Self` - The instance of Quickner
305    /// # Errors
306    /// This function will return an error if the configuration file does not exist
307    pub fn new(config_file: Option<&str>) -> Self {
308        let config_file = match config_file {
309            Some(config_file) => config_file.to_string(),
310            None => "./config.toml".to_string(),
311        };
312        // Check if the configuration file path exists
313        if Path::new(config_file.as_str()).exists() {
314            info!("Configuration file: {}", config_file.as_str());
315        } else {
316            warn!(
317                "Configuration file {} does not exist, using default Config",
318                config_file.as_str()
319            );
320            return Quickner::default();
321        }
322        let config = Config::from_file(config_file.as_str());
323        Quickner {
324            config,
325            config_file: Some(config_file),
326            ..Default::default()
327        }
328    }
329
330    pub fn add_document(&mut self, document: Document) {
331        {
332            let document = self.documents_hash.get(&document.id);
333            if document.is_some() {
334                warn!("Document {} already exists", document.unwrap().id);
335                return;
336            }
337        }
338        self.documents.push(document.to_owned());
339        self.documents_hash
340            .insert(document.id.to_owned(), document.to_owned());
341        self.add_to_entity_index(&document);
342        self.add_to_label_index(&document);
343    }
344
345    pub fn add_document_from_string(&mut self, text: &str) {
346        let document = Document::from_string(text.to_string());
347        self.documents.push(document.to_owned());
348        self.documents_hash
349            .insert(document.id.to_owned(), document.to_owned());
350        self.add_to_entity_index(&document);
351        self.add_to_label_index(&document);
352    }
353
354    pub fn add_entity(&mut self, entity: Entity) {
355        if self.entities.contains(&entity) {
356            warn!("Entity {} already exists", entity.name);
357            return;
358        }
359        self.entities.push(entity);
360    }
361
362    fn parse_config(&self) -> Config {
363        let mut config = self.config.clone();
364        config.entities.filters.set_special_characters();
365        config.texts.filters.set_special_characters();
366        let log_level_is_set = env::var("QUICKNER_LOG_LEVEL_SET").ok();
367        if log_level_is_set.is_none() {
368            match config.logging {
369                Some(ref mut logging) => {
370                    env_logger::Builder::from_env(
371                        env_logger::Env::default().default_filter_or(logging.level.as_str()),
372                    )
373                    .init();
374                    env::set_var("QUICKNER_LOG_LEVEL_SET", "true");
375                }
376                None => {
377                    env_logger::Builder::from_env(
378                        env_logger::Env::default().default_filter_or("info"),
379                    )
380                    .init();
381                    env::set_var("QUICKNER_LOG_LEVEL_SET", "true");
382                }
383            };
384        }
385
386        config
387    }
388
389    /// Process the texts and entities, and annotate the texts with the entities.
390    /// This method will return the annotations, and optionally save the annotations to a file.
391    /// # Arguments
392    /// * `self` - The instance of Quickner
393    /// * `save` - Whether to save the annotations to a file
394    /// # Example
395    /// ```
396    /// use quickner::Quickner;
397    /// let quickner = Quickner::new(Some("./config.toml"));
398    /// quickner.process(true);
399    /// ```
400    /// # Returns
401    /// * `Result<Annotations, Box<dyn Error>>` - The annotations
402    /// # Errors
403    /// This function will return an error if the configuration file does not exist
404    /// This function will return an error if the entities file does not exist
405    /// This function will return an error if the texts file does not exist
406    pub fn process(&mut self, save: bool) -> Result<(), Box<dyn Error>> {
407        let config = self.parse_config();
408        config.summary();
409        info!("----------------------------------------");
410        if self.entities.is_empty() {
411            let entities: HashSet<Entity> = self.entities(
412                config.entities.input.path.as_str(),
413                config.entities.filters,
414                config.entities.input.filter.unwrap_or(false),
415            );
416            self.entities = entities.into_iter().collect();
417        }
418        if self.documents.is_empty() {
419            let texts: HashSet<Text> = self.texts(
420                config.texts.input.path.as_str(),
421                config.texts.filters,
422                config.texts.input.filter.unwrap_or(false),
423            );
424            self.documents = texts
425                .par_iter()
426                .map(|text| Document::new((*text.text).to_string(), vec![]))
427                .collect();
428        }
429        let excludes: HashSet<String> = match config.entities.excludes.path {
430            Some(path) => {
431                info!("Reading excludes from {}", path.as_str());
432                self.excludes(path.as_str())
433            }
434            None => {
435                info!("No excludes file provided");
436                HashSet::new()
437            }
438        };
439        // Remove excludes from entities
440        let entities: HashSet<Entity> = self
441            .entities
442            .iter()
443            .filter(|entity| !excludes.contains(&entity.name))
444            .cloned()
445            .collect();
446        self.entities = Vec::from_iter(entities);
447        if !self.config.entities.filters.case_sensitive {
448            self.entities = self
449                .entities
450                .iter()
451                .map(|entity| Entity {
452                    name: entity.name.to_lowercase(),
453                    label: entity.label.to_string(),
454                })
455                .collect();
456        }
457        info!("{} entities found", self.entities.len());
458        self.annotate();
459        info!("{} annotations found", self.documents.len());
460        let len_entities = self.entities.len();
461        let len_documents = self.documents.len();
462        let number_of_checks = len_entities * len_documents;
463        // Transform number of checks to a human readable string
464        let number_of_checks = match number_of_checks {
465            0..=1000 => format!("{}", number_of_checks),
466            1001..=1000000 => format!("{:.2}K", number_of_checks as f64 / 1000.0),
467            1000001..=1000000000 => format!("{:.2}M", number_of_checks as f64 / 1000000.0),
468            _ => format!("{:.2}B", number_of_checks as f64 / 1000000000.0),
469        };
470        info!("Number of unique checks: {}", number_of_checks);
471        // annotations.save(&config.annotations.output.path);
472        if save {
473            let save = config
474                .annotations
475                .format
476                .save(&self.documents, &config.annotations.output.path);
477            match save {
478                Ok(_) => info!(
479                    "Annotations saved with format {:?}",
480                    config.annotations.format
481                ),
482                Err(e) => error!("Unable to save the annotations: {}", e),
483            }
484        }
485        // Transform annotations to Python objects
486        // List of tuples (text, [[start, end, label], [start, end, label], ...
487        // let annotations_py: Vec<(String, Vec<(usize, usize, String)>)> =
488        //     annotations.transform_annotations();
489        // Ok(annotations_py)
490        Ok(())
491    }
492
493    fn entities(&self, path: &str, filters: Filters, filter: bool) -> HashSet<Entity> {
494        // Read CSV file and parse it
495        // Expect columns: name, label
496        info!("Reading entities from {}", path);
497        let rdr = csv::Reader::from_path(path);
498        match rdr {
499            Ok(mut rdr) => {
500                let mut entities = HashSet::new();
501                for result in rdr.deserialize() {
502                    let record: Result<Entity, csv::Error> = result;
503                    match record {
504                        Ok(mut entity) => {
505                            if filter {
506                                if filters.is_valid(&entity.name) {
507                                    if !filters.case_sensitive {
508                                        entity.name = entity.name.to_lowercase();
509                                    }
510                                    entities.insert(entity);
511                                }
512                            } else {
513                                entities.insert(entity);
514                            }
515                        }
516                        Err(_) => {
517                            warn!("Unable to parse the entities file, using empty list");
518                            return HashSet::new();
519                        }
520                    }
521                }
522                entities
523            }
524            Err(_) => {
525                warn!("Unable to parse the entities file, using empty list");
526                HashSet::new()
527            }
528        }
529    }
530
531    fn texts(&self, path: &str, filters: Filters, filter: bool) -> HashSet<Text> {
532        // Read CSV file and parse it
533        // Expect columns: texts
534        info!("Reading texts from {}", path);
535        let rdr = csv::Reader::from_path(path);
536        match rdr {
537            Ok(mut rdr) => {
538                let mut texts = HashSet::new();
539                for result in rdr.deserialize() {
540                    let record: Result<Text, csv::Error> = result;
541                    match record {
542                        Ok(text) => {
543                            if filter {
544                                if filters.is_valid(&text.text) {
545                                    texts.insert(text);
546                                }
547                            } else {
548                                texts.insert(text);
549                            }
550                        }
551                        Err(e) => {
552                            error!("Unable to parse the texts file: {}", e);
553                            std::process::exit(1);
554                        }
555                    }
556                }
557                texts
558            }
559            Err(e) => {
560                error!("Unable to parse the texts file: {}", e);
561                std::process::exit(1);
562            }
563        }
564    }
565
566    fn excludes(&self, path: &str) -> HashSet<String> {
567        // Read CSV file and parse it
568        let rdr = csv::Reader::from_path(path);
569        match rdr {
570            Ok(mut rdr) => {
571                let mut excludes = HashSet::new();
572                for result in rdr.records() {
573                    let record = result.unwrap();
574                    excludes.insert(record[0].to_string());
575                }
576                excludes
577            }
578            Err(e) => {
579                error!("Unable to parse the excludes file: {}", e);
580                std::process::exit(1);
581            }
582        }
583    }
584
585    pub fn from_jsonl(path: &str) -> Quickner {
586        let file = File::open(path);
587        let file = match file {
588            Ok(file) => file,
589            Err(e) => {
590                error!("Unable to open the file {}: {}", path, e);
591                std::process::exit(1);
592            }
593        };
594        let reader = BufReader::new(file);
595        // Read the JSON objects from the file
596        // Parse each JSON object as Annotation and add it to the annotations
597        let mut entities = Vec::new();
598        let mut texts: Vec<Text> = Vec::new();
599        let documents: Vec<Document> = reader
600            .lines()
601            .map(|line| {
602                let line = line.unwrap();
603                let annotation: Document = serde_json::from_str(line.as_str()).unwrap();
604                let text = Text {
605                    text: (*annotation.text).to_string(),
606                };
607                texts.push(text);
608                // Extract the entity name from the label
609                for label in &annotation.label {
610                    let indices = char_to_byte((*annotation.text).to_string(), label.0, label.1);
611                    let name = annotation.text[indices.0..indices.1].to_string();
612                    let entity = Entity {
613                        name: name.to_string().to_lowercase(),
614                        label: label.2.to_string(),
615                    };
616                    entities.push(entity);
617                }
618                annotation
619            })
620            .collect();
621        let entities = Quickner::unique_entities(entities);
622        let documents_hash = Quickner::document_hash(&documents);
623        let mut quick = Quickner {
624            config: Config::default(),
625            config_file: None,
626            documents,
627            entities,
628            documents_hash,
629            documents_label_index: HashMap::new(),
630            documents_entities_index: HashMap::new(),
631        };
632        quick.build_entity_index();
633        quick.build_label_index();
634        quick
635    }
636
637    pub fn from_spacy(path: &str) -> Quickner {
638        let file = File::open(path);
639        let file = match file {
640            Ok(file) => file,
641            Err(e) => {
642                error!("Unable to open the file {}: {}", path, e);
643                std::process::exit(1);
644            }
645        };
646        let reader = BufReader::new(file);
647        // Read the JSON objects from the file
648        // Parse each JSON object as Annotation and add it to the annotations
649        let mut entities: Vec<Entity> = Vec::new();
650        let mut texts: Vec<Text> = Vec::new();
651        let spacy = serde_json::from_reader(reader);
652        let spacy: Vec<(String, SpacyEntity)> = match spacy {
653            Ok(spacy) => spacy,
654            Err(e) => {
655                error!("Unable to parse the file {}: {}", path, e);
656                std::process::exit(1);
657            }
658        };
659        let documents: Vec<Document> = spacy
660            .into_iter()
661            .map(|doc| {
662                let text = Text {
663                    text: (*doc.0).to_string(),
664                };
665                texts.push(text);
666                // Extract the entity name from the label
667                for ent in &doc.1.entity {
668                    let name = doc.0[ent.0..ent.1].to_string();
669                    let entity = Entity {
670                        name: name.to_lowercase(),
671                        label: ent.2.to_string(),
672                    };
673                    entities.push(entity);
674                }
675                Document::new(doc.0, doc.1.entity)
676            })
677            .collect();
678        let entities = Quickner::unique_entities(entities);
679        let documents_hash = Quickner::document_hash(&documents);
680        let mut quick = Quickner {
681            config: Config::default(),
682            config_file: None,
683            documents,
684            entities,
685            documents_hash,
686            documents_label_index: HashMap::new(),
687            documents_entities_index: HashMap::new(),
688        };
689        quick.build_entity_index();
690        quick.build_label_index();
691        quick
692    }
693
694    pub fn spacy(&self, chunks: Option<usize>) -> Vec<Vec<(String, SpacyEntity)>> {
695        let mut spacy: Vec<(String, SpacyEntity)> = Vec::new();
696        for document in &self.documents {
697            let mut entity: Vec<(usize, usize, String)> = Vec::new();
698            for label in &document.label {
699                entity.push((label.0, label.1, (*label.2).to_string()));
700            }
701            spacy.push(((*document.text).to_string(), SpacyEntity { entity }));
702        }
703        let chunks = match chunks {
704            Some(chunks) => chunks,
705            None => spacy.len(),
706        };
707        // Split the spacy vector into chunks
708        // i.e. if the vector has 1000 elements and the chunks is 100 then
709        // the vector will be split into 10 chunks of 100 elements each
710        let mut spacy_chunks: Vec<Vec<(String, SpacyEntity)>> = Vec::new();
711        for chunk in spacy.chunks(chunks) {
712            spacy_chunks.push(chunk.to_vec());
713        }
714        spacy_chunks
715    }
716}
717
718impl Quickner {
719    pub fn build_label_index(&mut self) {
720        let mut index: HashMap<String, Vec<String>> = HashMap::new();
721        for document in &self.documents {
722            for label in &document.label {
723                let entry = index.entry((*label.2).to_string()).or_insert(Vec::new());
724                entry.push((*document.id).to_string());
725            }
726        }
727        self.documents_label_index = index;
728    }
729
730    pub fn build_entity_index(&mut self) {
731        let mut index: HashMap<String, Vec<String>> = HashMap::new();
732        for document in &self.documents {
733            for label in &document.label {
734                // Translate the indices to byte indices
735                let indices = char_to_byte((*document.text).to_string(), label.0, label.1);
736                let name = document.text[indices.0..indices.1].to_string();
737                let entry = index.entry(name.to_lowercase()).or_insert(Vec::new());
738                entry.push((*document.id).to_string());
739            }
740        }
741        self.documents_entities_index = index;
742    }
743
744    fn add_to_label_index(&mut self, document: &Document) {
745        for label in &document.label {
746            let entry = self
747                .documents_label_index
748                .entry((*label.2).to_string())
749                .or_insert(Vec::new());
750            entry.push((*document.id).to_string());
751        }
752    }
753
754    fn add_to_entity_index(&mut self, document: &Document) {
755        for label in &document.label {
756            let indices = char_to_byte((*document.text).to_string(), label.0, label.1);
757            let name = document.text[indices.0..indices.1].to_string();
758            let entry = self
759                .documents_entities_index
760                .entry(name.to_lowercase())
761                .or_insert(Vec::new());
762            entry.push((*document.id).to_string());
763        }
764    }
765
766    fn _remove_from_label_index(&mut self, document: &Document) {
767        for label in &document.label {
768            let entry = self
769                .documents_label_index
770                .entry((*label.2).to_string())
771                .or_insert(Vec::new());
772            entry.retain(|x| x != &document.id);
773        }
774    }
775
776    fn _remove_from_entity_index(&mut self, document: &Document) {
777        for label in &document.label {
778            let indices = char_to_byte(document.text.clone(), label.0, label.1);
779            let name = document.text[indices.0..indices.1].to_string();
780            let entry = self
781                .documents_entities_index
782                .entry(name.to_lowercase())
783                .or_insert(Vec::new());
784            entry.retain(|x| x != &document.id);
785        }
786    }
787
788    fn unique_entities(entities: Vec<Entity>) -> Vec<Entity> {
789        entities
790            .into_iter()
791            .collect::<HashSet<Entity>>()
792            .into_iter()
793            .collect::<Vec<Entity>>()
794    }
795
796    pub fn document_hash(documents: &[Document]) -> HashMap<String, Document> {
797        documents
798            .iter()
799            .map(|document| (document.id.clone(), document.clone()))
800            .collect::<HashMap<String, Document>>()
801    }
802}