1use std::sync::Arc;
2
3use aho_corasick::AhoCorasick;
4use serde::{Deserialize, Serialize};
5use utils::hash_string;
6
7use crate::entity::Entity;
8use crate::quickner::Quickner;
9use crate::utils;
10#[derive(Serialize, Deserialize, Clone, Debug)]
15pub struct Document {
16 pub id: String,
17 pub text: String,
18 pub label: Vec<(usize, usize, String)>,
19}
20
21impl PartialEq for Document {
22 fn eq(&self, other: &Self) -> bool {
23 self.id == other.id && self.text == other.text && self.label == other.label
24 }
25}
26
27impl Document {
28 pub fn from_string(text: String) -> Self {
37 let id = hash_string(text.as_str());
38 Document {
39 id,
40 text,
41 label: Vec::new(),
42 }
43 }
44
45 pub fn new(text: String, label: Vec<(usize, usize, String)>) -> Self {
46 let id = hash_string(text.as_str());
47 Self { id, text, label }
48 }
49
50 pub fn annotate(&mut self, mut entities: Vec<Entity>, case_sensitive: bool) {
66 if !case_sensitive {
67 self.text = self.text.to_lowercase();
68 entities
69 .iter_mut()
70 .for_each(|e| e.name = e.name.to_lowercase());
71 }
72 let patterns = entities
73 .iter()
74 .map(|entity| entity.name.as_str())
75 .collect::<Vec<&str>>();
76 let aho_corasick = Arc::new(AhoCorasick::new(patterns));
77 let label = Quickner::find_index_using_aho_corasick(&self.text, &aho_corasick, &entities);
78 match label {
79 Some(label) => self.label.extend(label),
80 None => self.label.extend(Vec::new()),
81 }
82 self.label
84 .sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)).then(a.2.cmp(&b.2)));
85 self.set_unique_labels();
86 }
87
88 fn set_unique_labels(&mut self) {
89 let mut labels: Vec<(usize, usize, String)> = Vec::new();
90 for (start, end, label) in &self.label {
91 if !labels.contains(&(*start, *end, label.to_string())) {
92 labels.push((*start, *end, label.to_string()));
93 }
94 }
95 self.label = labels;
96 }
97}