quickner/
document.rs

1use std::sync::Arc;
2
3use aho_corasick::AhoCorasick;
4use serde::{Deserialize, Serialize};
5use utils::hash_string;
6
7use crate::entity::Entity;
8use crate::quickner::Quickner;
9use crate::utils;
10/// An annotation is a text with a set of entities
11///
12/// This object is used to hold the text and the
13/// entities found in the text.
14#[derive(Serialize, Deserialize, Clone, Debug)]
15pub struct Document {
16    pub id: String,
17    pub text: String,
18    pub label: Vec<(usize, usize, String)>,
19}
20
21impl PartialEq for Document {
22    fn eq(&self, other: &Self) -> bool {
23        self.id == other.id && self.text == other.text && self.label == other.label
24    }
25}
26
27impl Document {
28    /// Create an annotation from a string
29    /// # Examples
30    /// ```
31    /// use quickner::models::Annotation;
32    ///
33    /// let annotation = Annotation::from_string("Rust is developed by Mozilla".to_string());
34    /// assert_eq!(annotation.text, "Rust is developed by Mozilla");
35    /// ```
36    pub fn from_string(text: String) -> Self {
37        let id = hash_string(text.as_str());
38        Document {
39            id,
40            text,
41            label: Vec::new(),
42        }
43    }
44
45    pub fn new(text: String, label: Vec<(usize, usize, String)>) -> Self {
46        let id = hash_string(text.as_str());
47        Self { id, text, label }
48    }
49
50    /// Annotate text given a set of entities
51    /// # Examples
52    /// ```
53    /// use quickner::models::Document;
54    /// use quickner::models::Entity;
55    /// use std::collections::HashSet;
56    ///
57    /// let mut annotation = Annotation::from_string("Rust is developed by Mozilla".to_string());
58    /// let entities = vec![
59    ///    Entity::new("Rust".to_string(), "Language".to_string()),
60    ///    Entity::new("Mozilla".to_string(), "Organization".to_string()),
61    /// ].into_iter().collect();
62    /// annotation.annotate(entities);
63    /// assert_eq!(annotation.label, vec![(0, 4, "Language".to_string()), (23, 30, "Organization".to_string())]);
64    /// ```
65    pub fn annotate(&mut self, mut entities: Vec<Entity>, case_sensitive: bool) {
66        if !case_sensitive {
67            self.text = self.text.to_lowercase();
68            entities
69                .iter_mut()
70                .for_each(|e| e.name = e.name.to_lowercase());
71        }
72        let patterns = entities
73            .iter()
74            .map(|entity| entity.name.as_str())
75            .collect::<Vec<&str>>();
76        let aho_corasick = Arc::new(AhoCorasick::new(patterns));
77        let label = Quickner::find_index_using_aho_corasick(&self.text, &aho_corasick, &entities);
78        match label {
79            Some(label) => self.label.extend(label),
80            None => self.label.extend(Vec::new()),
81        }
82        // Remove duplicate labels based on start and end index and label
83        self.label
84            .sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)).then(a.2.cmp(&b.2)));
85        self.set_unique_labels();
86    }
87
88    fn set_unique_labels(&mut self) {
89        let mut labels: Vec<(usize, usize, String)> = Vec::new();
90        for (start, end, label) in &self.label {
91            if !labels.contains(&(*start, *end, label.to_string())) {
92                labels.push((*start, *end, label.to_string()));
93            }
94        }
95        self.label = labels;
96    }
97}