1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
use serde::{Deserialize, Serialize};
use utils::hash_string;
use crate::entity::Entity;
use crate::quickner::Quickner;
use crate::utils;
/// A piece of text together with the entity spans found in it.
///
/// Serializable/deserializable through serde; the field names and their
/// order are part of that serialized shape.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Document {
/// Identifier of the document. The constructors in this file set it to
/// `hash_string(text)`; it is not recomputed when `text` changes later.
pub id: String,
/// The document text. `annotate` overwrites it with its lowercase form
/// when called with `case_sensitive == false`.
pub text: String,
/// Entity spans as `(start, end, label)` tuples. `annotate` appends the
/// spans returned by `Quickner::find_index`, then sorts and deduplicates.
/// NOTE(review): presumably `start`/`end` are offsets into `text` as
/// produced by `Quickner::find_index` — confirm unit and inclusivity there.
pub label: Vec<(usize, usize, String)>,
}
impl PartialEq for Document {
    /// Two documents are equal when their `id`, `text` and `label` list are
    /// all equal; the fields are compared in that order.
    fn eq(&self, other: &Self) -> bool {
        // Tuple equality compares element by element, left to right, and
        // stops at the first mismatch.
        (&self.id, &self.text, &self.label) == (&other.id, &other.text, &other.label)
    }
}
impl Document {
    /// Builds a document from `text` with an empty label list.
    ///
    /// The `id` is whatever `hash_string` returns for `text`.
    pub fn from_string(text: String) -> Self {
        Self::new(text, Vec::new())
    }

    /// Builds a document from `text` and an existing list of
    /// `(start, end, label)` spans.
    ///
    /// The `id` is computed by passing `text` to `hash_string`. The spans are
    /// stored exactly as given: they are not validated, sorted or
    /// deduplicated here.
    pub fn new(text: String, label: Vec<(usize, usize, String)>) -> Self {
        let id = hash_string(text.as_str());
        Self { id, text, label }
    }

    /// Searches the document text for `entities` and merges the resulting
    /// spans into `self.label`.
    ///
    /// The spans returned by `Quickner::find_index` are appended to the
    /// existing labels; the combined list is then sorted by
    /// `(start, end, label)` and exact duplicates are removed.
    ///
    /// When `case_sensitive` is `false`, both the entity names and the
    /// document text are lowercased before searching. The lowercased text is
    /// written back to `self.text`, so the stored text is permanently
    /// replaced, and `self.id` is left unchanged.
    pub fn annotate(&mut self, mut entities: Vec<Entity>, case_sensitive: bool) {
        if !case_sensitive {
            // The search below runs against `self.text`, so the text that
            // is kept is the same text the spans were computed on.
            self.text = self.text.to_lowercase();
            for entity in &mut entities {
                entity.name = entity.name.to_lowercase();
            }
        }

        // `None` means there is nothing to add.
        if let Some(found) = Quickner::find_index(self.text.clone(), entities) {
            self.label.extend(found);
        }

        // The tuple's own ordering is lexicographic over (start, end, label).
        // Elements that compare equal are identical, so an unstable sort
        // cannot produce a different result.
        self.label.sort_unstable();
        self.set_unique_labels();
    }

    /// Removes duplicate `(start, end, label)` spans from `self.label`,
    /// keeping the first occurrence of each and preserving relative order.
    fn set_unique_labels(&mut self) {
        let mut seen = std::collections::HashSet::with_capacity(self.label.len());
        // `insert` returns `false` for a span that was already seen, which
        // makes `retain` drop every repeat.
        self.label.retain(|span| seen.insert(span.clone()));
    }
}