1use crate::{
2 config::{Config, Filters},
3 models::Text,
4 utils::{char_to_byte, get_progress_bar, is_valid_utf8},
5 SpacyEntity,
6};
7use aho_corasick::AhoCorasick;
8use log::{error, info, warn};
9use rayon::prelude::*;
10use std::{collections::HashMap, path::Path, sync::Arc};
11use std::{
12 collections::HashSet,
13 fs::File,
14 io::{BufRead, BufReader},
15};
16use std::{env, error::Error};
17
18use crate::document::Document;
19use crate::entity::Entity;
20
/// Central state for the annotation pipeline: configuration, input
/// documents, the entity gazetteer, and the lookup indexes derived
/// from the annotations.
#[derive(Clone)]
pub struct Quickner {
    // Parsed configuration (filters, IO paths, logging, ...).
    pub config: Config,
    // Path of the configuration file this instance was loaded from, if
    // any (set to None when built via `from_jsonl` / `from_spacy`).
    pub config_file: Option<String>,
    // Documents to annotate / already annotated.
    pub documents: Vec<Document>,
    // Entities (name + label) searched for in the documents.
    pub entities: Vec<Entity>,
    // Documents keyed by their id for O(1) lookup.
    pub documents_hash: HashMap<String, Document>,
    // label -> ids of documents annotated with that label
    // (built by `build_label_index`).
    pub documents_label_index: HashMap<String, Vec<String>>,
    // lowercased entity surface form -> ids of documents containing it
    // (built by `build_entity_index`).
    pub documents_entities_index: HashMap<String, Vec<String>>,
}
35
36impl Default for Quickner {
37 fn default() -> Self {
38 Self {
39 config: Config::default(),
40 config_file: Some("./config.toml".to_string()),
41 documents: Vec::new(),
42 entities: Vec::new(),
43 documents_hash: HashMap::new(),
44 documents_label_index: HashMap::new(),
45 documents_entities_index: HashMap::new(),
46 }
47 }
48}
49
50impl Quickner {
51 pub(crate) fn find_index(
69 text: String,
70 entities: Vec<Entity>,
71 ) -> Option<Vec<(usize, usize, String)>> {
72 let annotations = entities.iter().filter_map(|entity| {
74 let target_len = entity.name.len();
75 for (start, _) in text.match_indices(entity.name.as_str()) {
76 if start == 0
77 || text
78 .chars()
79 .nth(start - 1)
80 .unwrap_or_else(|| 'N')
81 .is_whitespace()
82 || text
83 .chars()
84 .nth(start - 1)
85 .unwrap_or_else(|| 'N')
86 .is_ascii_punctuation()
87 || ((start + target_len) == text.len()
88 || text
89 .chars()
90 .nth(start + target_len)
91 .unwrap_or('N')
92 .is_whitespace()
93 || (text
94 .chars()
95 .nth(start + target_len)
96 .unwrap_or('N')
97 .is_ascii_punctuation()
98 && text.chars().nth(start + target_len).unwrap() != '.'
99 && (start > 0 && text.chars().nth(start - 1).unwrap() != '.')))
100 {
101 return Some((start, start + target_len, entity.label.to_string()));
102 }
103 }
104 None
105 });
106 let mut annotations = annotations.collect::<Vec<(usize, usize, String)>>();
108 annotations.sort_by(|a, b| a.0.cmp(&b.0));
109 annotations.dedup();
110 if !annotations.is_empty() {
112 Some(annotations)
113 } else {
114 None
115 }
116 }
117
118 pub(crate) fn find_index_using_aho_corasick(
119 text: &str,
120 aho_corasick: &Arc<AhoCorasick>,
121 entites: &Vec<Entity>,
122 ) -> Option<Vec<(usize, usize, String)>> {
123 if !is_valid_utf8(text) {
124 warn!("Skipping invalid utf8 text: \"{}\"", text);
125 return None;
126 }
127 let mut annotations = Vec::new();
128 for mat in aho_corasick.find_overlapping_iter(&text) {
129 let start = mat.start();
130 let start = text[..start].chars().count();
132 let end = mat.end();
133 let end = text[..end].chars().count();
134 let label = entites[mat.pattern()].label.to_string();
135 let name = entites[mat.pattern()].name.to_string();
136 let target_len = name.len();
137 if start == 0
138 && (text.chars().nth(end).unwrap_or('N').is_whitespace()
139 || (text.chars().nth(end).unwrap_or('N').is_ascii_punctuation()))
140 {
141 annotations.push((start, end, label));
142 continue;
143 }
144 if start > 0
149 && text
150 .chars()
151 .nth(start - 1)
152 .unwrap_or_else(|| 'N')
153 .is_whitespace()
154 && (text.chars().nth(end).unwrap_or_else(|| 'N').is_whitespace()
155 || text
156 .chars()
157 .nth(end)
158 .unwrap_or_else(|| 'N')
159 .is_ascii_punctuation())
160 {
161 annotations.push((start, end, label));
162 continue;
163 }
164 if start > 0
165 && text
166 .chars()
167 .nth(start - 1)
168 .unwrap_or_else(|| 'N')
169 .is_ascii_punctuation()
170 && (text.chars().nth(end).unwrap_or_else(|| 'N').is_whitespace()
171 || text
172 .chars()
173 .nth(end)
174 .unwrap_or_else(|| 'N')
175 .is_ascii_punctuation())
176 {
177 annotations.push((start, end, label));
178 continue;
179 }
180 if (start + target_len) == text.len() {
181 annotations.push((start, end, label));
182 continue;
183 }
184 if (text
185 .chars()
186 .nth(start - 1)
187 .unwrap_or_else(|| 'N')
188 .is_ascii_punctuation()
189 || text
190 .chars()
191 .nth(start - 1)
192 .unwrap_or_else(|| 'N')
193 .is_whitespace())
194 && text
195 .chars()
196 .nth(start + target_len)
197 .unwrap_or('N')
198 .is_whitespace()
199 {
200 annotations.push((start, end, label));
201 continue;
202 }
203 if (text
204 .chars()
205 .nth(start - 1)
206 .unwrap_or_else(|| 'N')
207 .is_ascii_punctuation()
208 || text
209 .chars()
210 .nth(start - 1)
211 .unwrap_or_else(|| 'N')
212 .is_whitespace())
213 && text
214 .chars()
215 .nth(start + target_len)
216 .unwrap_or('N')
217 .is_ascii_punctuation()
218 && text.chars().nth(start + target_len).unwrap() != '.'
219 && (start > 0 && text.chars().nth(start - 1).unwrap() != '.')
220 {
221 annotations.push((start, end, label));
222 }
223 }
224 annotations.sort_by(|a, b| a.0.cmp(&b.0));
226 annotations.dedup();
227 if !annotations.is_empty() {
229 Some(annotations)
230 } else {
231 None
232 }
233 }
234
    /// Annotates every document in place with entity matches.
    ///
    /// Builds a single Aho-Corasick automaton over all entity names, then
    /// scans documents in parallel (rayon). When the text filter is not
    /// case sensitive, each document's text is lowercased in place before
    /// matching (entity names are expected to be lowercased upstream,
    /// e.g. by `process` — TODO confirm for other call paths).
    pub fn annotate(&mut self) {
        let pb = get_progress_bar(self.documents.len() as u64);
        pb.set_message("Annotating texts");
        // Pattern order must mirror `self.entities`: the matcher maps a
        // pattern index back to the entity to get its label.
        let patterns = self
            .entities
            .iter()
            .map(|entity| entity.name.as_str())
            .collect::<Vec<&str>>();
        let aho_corasick = Arc::new(AhoCorasick::new(patterns));
        // Disjoint field borrows: `documents` is borrowed mutably while
        // the closure only reads `config` and `entities`.
        self.documents.par_iter_mut().for_each(|document| {
            let t: &mut String = &mut document.text;
            if !self.config.texts.filters.case_sensitive {
                *t = t.to_lowercase();
            };
            let index = Quickner::find_index_using_aho_corasick(&t, &aho_corasick, &self.entities);
            // No match is treated the same as an empty match list.
            let mut index = match index {
                Some(index) => index,
                None => vec![],
            };
            index.sort_by(|a, b| a.0.cmp(&b.0));
            // Appends to any labels the document already carries.
            document.label.extend(index);
            pb.inc(1);
        });
        // Rebuild the id -> document map and both lookup indexes so they
        // reflect the freshly added annotations.
        self.documents_hash = self
            .documents
            .iter()
            .map(|document| (document.id.clone(), document.clone()))
            .collect();
        self.build_label_index();
        self.build_entity_index();
        pb.finish();
    }
290
291 pub fn new(config_file: Option<&str>) -> Self {
308 let config_file = match config_file {
309 Some(config_file) => config_file.to_string(),
310 None => "./config.toml".to_string(),
311 };
312 if Path::new(config_file.as_str()).exists() {
314 info!("Configuration file: {}", config_file.as_str());
315 } else {
316 warn!(
317 "Configuration file {} does not exist, using default Config",
318 config_file.as_str()
319 );
320 return Quickner::default();
321 }
322 let config = Config::from_file(config_file.as_str());
323 Quickner {
324 config,
325 config_file: Some(config_file),
326 ..Default::default()
327 }
328 }
329
330 pub fn add_document(&mut self, document: Document) {
331 {
332 let document = self.documents_hash.get(&document.id);
333 if document.is_some() {
334 warn!("Document {} already exists", document.unwrap().id);
335 return;
336 }
337 }
338 self.documents.push(document.to_owned());
339 self.documents_hash
340 .insert(document.id.to_owned(), document.to_owned());
341 self.add_to_entity_index(&document);
342 self.add_to_label_index(&document);
343 }
344
345 pub fn add_document_from_string(&mut self, text: &str) {
346 let document = Document::from_string(text.to_string());
347 self.documents.push(document.to_owned());
348 self.documents_hash
349 .insert(document.id.to_owned(), document.to_owned());
350 self.add_to_entity_index(&document);
351 self.add_to_label_index(&document);
352 }
353
354 pub fn add_entity(&mut self, entity: Entity) {
355 if self.entities.contains(&entity) {
356 warn!("Entity {} already exists", entity.name);
357 return;
358 }
359 self.entities.push(entity);
360 }
361
362 fn parse_config(&self) -> Config {
363 let mut config = self.config.clone();
364 config.entities.filters.set_special_characters();
365 config.texts.filters.set_special_characters();
366 let log_level_is_set = env::var("QUICKNER_LOG_LEVEL_SET").ok();
367 if log_level_is_set.is_none() {
368 match config.logging {
369 Some(ref mut logging) => {
370 env_logger::Builder::from_env(
371 env_logger::Env::default().default_filter_or(logging.level.as_str()),
372 )
373 .init();
374 env::set_var("QUICKNER_LOG_LEVEL_SET", "true");
375 }
376 None => {
377 env_logger::Builder::from_env(
378 env_logger::Env::default().default_filter_or("info"),
379 )
380 .init();
381 env::set_var("QUICKNER_LOG_LEVEL_SET", "true");
382 }
383 };
384 }
385
386 config
387 }
388
    /// Runs the full pipeline: load entities and texts (unless already
    /// populated), apply excludes and case filters, annotate every
    /// document, and optionally save the annotations.
    ///
    /// Currently always returns `Ok(())`; a failed save is only logged.
    pub fn process(&mut self, save: bool) -> Result<(), Box<dyn Error>> {
        let config = self.parse_config();
        config.summary();
        info!("----------------------------------------");
        // Only read entities/texts from disk when the caller has not
        // already supplied them (e.g. via add_entity / add_document).
        if self.entities.is_empty() {
            let entities: HashSet<Entity> = self.entities(
                config.entities.input.path.as_str(),
                config.entities.filters,
                config.entities.input.filter.unwrap_or(false),
            );
            self.entities = entities.into_iter().collect();
        }
        if self.documents.is_empty() {
            let texts: HashSet<Text> = self.texts(
                config.texts.input.path.as_str(),
                config.texts.filters,
                config.texts.input.filter.unwrap_or(false),
            );
            self.documents = texts
                .par_iter()
                .map(|text| Document::new((*text.text).to_string(), vec![]))
                .collect();
        }
        // Entity names listed in the excludes file are dropped entirely.
        let excludes: HashSet<String> = match config.entities.excludes.path {
            Some(path) => {
                info!("Reading excludes from {}", path.as_str());
                self.excludes(path.as_str())
            }
            None => {
                info!("No excludes file provided");
                HashSet::new()
            }
        };
        // The round-trip through a HashSet also deduplicates the entities.
        let entities: HashSet<Entity> = self
            .entities
            .iter()
            .filter(|entity| !excludes.contains(&entity.name))
            .cloned()
            .collect();
        self.entities = Vec::from_iter(entities);
        // Lowercase entity names up front; `annotate` separately
        // lowercases document text when the TEXT filter is
        // case-insensitive.
        if !self.config.entities.filters.case_sensitive {
            self.entities = self
                .entities
                .iter()
                .map(|entity| Entity {
                    name: entity.name.to_lowercase(),
                    label: entity.label.to_string(),
                })
                .collect();
        }
        info!("{} entities found", self.entities.len());
        self.annotate();
        info!("{} annotations found", self.documents.len());
        // Human-readable figure for the entity x document comparison count.
        let len_entities = self.entities.len();
        let len_documents = self.documents.len();
        let number_of_checks = len_entities * len_documents;
        let number_of_checks = match number_of_checks {
            0..=1000 => format!("{}", number_of_checks),
            1001..=1000000 => format!("{:.2}K", number_of_checks as f64 / 1000.0),
            1000001..=1000000000 => format!("{:.2}M", number_of_checks as f64 / 1000000.0),
            _ => format!("{:.2}B", number_of_checks as f64 / 1000000000.0),
        };
        info!("Number of unique checks: {}", number_of_checks);
        if save {
            // A failed save is reported but does not fail the pipeline.
            let save = config
                .annotations
                .format
                .save(&self.documents, &config.annotations.output.path);
            match save {
                Ok(_) => info!(
                    "Annotations saved with format {:?}",
                    config.annotations.format
                ),
                Err(e) => error!("Unable to save the annotations: {}", e),
            }
        }
        Ok(())
    }
492
493 fn entities(&self, path: &str, filters: Filters, filter: bool) -> HashSet<Entity> {
494 info!("Reading entities from {}", path);
497 let rdr = csv::Reader::from_path(path);
498 match rdr {
499 Ok(mut rdr) => {
500 let mut entities = HashSet::new();
501 for result in rdr.deserialize() {
502 let record: Result<Entity, csv::Error> = result;
503 match record {
504 Ok(mut entity) => {
505 if filter {
506 if filters.is_valid(&entity.name) {
507 if !filters.case_sensitive {
508 entity.name = entity.name.to_lowercase();
509 }
510 entities.insert(entity);
511 }
512 } else {
513 entities.insert(entity);
514 }
515 }
516 Err(_) => {
517 warn!("Unable to parse the entities file, using empty list");
518 return HashSet::new();
519 }
520 }
521 }
522 entities
523 }
524 Err(_) => {
525 warn!("Unable to parse the entities file, using empty list");
526 HashSet::new()
527 }
528 }
529 }
530
531 fn texts(&self, path: &str, filters: Filters, filter: bool) -> HashSet<Text> {
532 info!("Reading texts from {}", path);
535 let rdr = csv::Reader::from_path(path);
536 match rdr {
537 Ok(mut rdr) => {
538 let mut texts = HashSet::new();
539 for result in rdr.deserialize() {
540 let record: Result<Text, csv::Error> = result;
541 match record {
542 Ok(text) => {
543 if filter {
544 if filters.is_valid(&text.text) {
545 texts.insert(text);
546 }
547 } else {
548 texts.insert(text);
549 }
550 }
551 Err(e) => {
552 error!("Unable to parse the texts file: {}", e);
553 std::process::exit(1);
554 }
555 }
556 }
557 texts
558 }
559 Err(e) => {
560 error!("Unable to parse the texts file: {}", e);
561 std::process::exit(1);
562 }
563 }
564 }
565
566 fn excludes(&self, path: &str) -> HashSet<String> {
567 let rdr = csv::Reader::from_path(path);
569 match rdr {
570 Ok(mut rdr) => {
571 let mut excludes = HashSet::new();
572 for result in rdr.records() {
573 let record = result.unwrap();
574 excludes.insert(record[0].to_string());
575 }
576 excludes
577 }
578 Err(e) => {
579 error!("Unable to parse the excludes file: {}", e);
580 std::process::exit(1);
581 }
582 }
583 }
584
585 pub fn from_jsonl(path: &str) -> Quickner {
586 let file = File::open(path);
587 let file = match file {
588 Ok(file) => file,
589 Err(e) => {
590 error!("Unable to open the file {}: {}", path, e);
591 std::process::exit(1);
592 }
593 };
594 let reader = BufReader::new(file);
595 let mut entities = Vec::new();
598 let mut texts: Vec<Text> = Vec::new();
599 let documents: Vec<Document> = reader
600 .lines()
601 .map(|line| {
602 let line = line.unwrap();
603 let annotation: Document = serde_json::from_str(line.as_str()).unwrap();
604 let text = Text {
605 text: (*annotation.text).to_string(),
606 };
607 texts.push(text);
608 for label in &annotation.label {
610 let indices = char_to_byte((*annotation.text).to_string(), label.0, label.1);
611 let name = annotation.text[indices.0..indices.1].to_string();
612 let entity = Entity {
613 name: name.to_string().to_lowercase(),
614 label: label.2.to_string(),
615 };
616 entities.push(entity);
617 }
618 annotation
619 })
620 .collect();
621 let entities = Quickner::unique_entities(entities);
622 let documents_hash = Quickner::document_hash(&documents);
623 let mut quick = Quickner {
624 config: Config::default(),
625 config_file: None,
626 documents,
627 entities,
628 documents_hash,
629 documents_label_index: HashMap::new(),
630 documents_entities_index: HashMap::new(),
631 };
632 quick.build_entity_index();
633 quick.build_label_index();
634 quick
635 }
636
637 pub fn from_spacy(path: &str) -> Quickner {
638 let file = File::open(path);
639 let file = match file {
640 Ok(file) => file,
641 Err(e) => {
642 error!("Unable to open the file {}: {}", path, e);
643 std::process::exit(1);
644 }
645 };
646 let reader = BufReader::new(file);
647 let mut entities: Vec<Entity> = Vec::new();
650 let mut texts: Vec<Text> = Vec::new();
651 let spacy = serde_json::from_reader(reader);
652 let spacy: Vec<(String, SpacyEntity)> = match spacy {
653 Ok(spacy) => spacy,
654 Err(e) => {
655 error!("Unable to parse the file {}: {}", path, e);
656 std::process::exit(1);
657 }
658 };
659 let documents: Vec<Document> = spacy
660 .into_iter()
661 .map(|doc| {
662 let text = Text {
663 text: (*doc.0).to_string(),
664 };
665 texts.push(text);
666 for ent in &doc.1.entity {
668 let name = doc.0[ent.0..ent.1].to_string();
669 let entity = Entity {
670 name: name.to_lowercase(),
671 label: ent.2.to_string(),
672 };
673 entities.push(entity);
674 }
675 Document::new(doc.0, doc.1.entity)
676 })
677 .collect();
678 let entities = Quickner::unique_entities(entities);
679 let documents_hash = Quickner::document_hash(&documents);
680 let mut quick = Quickner {
681 config: Config::default(),
682 config_file: None,
683 documents,
684 entities,
685 documents_hash,
686 documents_label_index: HashMap::new(),
687 documents_entities_index: HashMap::new(),
688 };
689 quick.build_entity_index();
690 quick.build_label_index();
691 quick
692 }
693
694 pub fn spacy(&self, chunks: Option<usize>) -> Vec<Vec<(String, SpacyEntity)>> {
695 let mut spacy: Vec<(String, SpacyEntity)> = Vec::new();
696 for document in &self.documents {
697 let mut entity: Vec<(usize, usize, String)> = Vec::new();
698 for label in &document.label {
699 entity.push((label.0, label.1, (*label.2).to_string()));
700 }
701 spacy.push(((*document.text).to_string(), SpacyEntity { entity }));
702 }
703 let chunks = match chunks {
704 Some(chunks) => chunks,
705 None => spacy.len(),
706 };
707 let mut spacy_chunks: Vec<Vec<(String, SpacyEntity)>> = Vec::new();
711 for chunk in spacy.chunks(chunks) {
712 spacy_chunks.push(chunk.to_vec());
713 }
714 spacy_chunks
715 }
716}
717
718impl Quickner {
719 pub fn build_label_index(&mut self) {
720 let mut index: HashMap<String, Vec<String>> = HashMap::new();
721 for document in &self.documents {
722 for label in &document.label {
723 let entry = index.entry((*label.2).to_string()).or_insert(Vec::new());
724 entry.push((*document.id).to_string());
725 }
726 }
727 self.documents_label_index = index;
728 }
729
730 pub fn build_entity_index(&mut self) {
731 let mut index: HashMap<String, Vec<String>> = HashMap::new();
732 for document in &self.documents {
733 for label in &document.label {
734 let indices = char_to_byte((*document.text).to_string(), label.0, label.1);
736 let name = document.text[indices.0..indices.1].to_string();
737 let entry = index.entry(name.to_lowercase()).or_insert(Vec::new());
738 entry.push((*document.id).to_string());
739 }
740 }
741 self.documents_entities_index = index;
742 }
743
744 fn add_to_label_index(&mut self, document: &Document) {
745 for label in &document.label {
746 let entry = self
747 .documents_label_index
748 .entry((*label.2).to_string())
749 .or_insert(Vec::new());
750 entry.push((*document.id).to_string());
751 }
752 }
753
754 fn add_to_entity_index(&mut self, document: &Document) {
755 for label in &document.label {
756 let indices = char_to_byte((*document.text).to_string(), label.0, label.1);
757 let name = document.text[indices.0..indices.1].to_string();
758 let entry = self
759 .documents_entities_index
760 .entry(name.to_lowercase())
761 .or_insert(Vec::new());
762 entry.push((*document.id).to_string());
763 }
764 }
765
766 fn _remove_from_label_index(&mut self, document: &Document) {
767 for label in &document.label {
768 let entry = self
769 .documents_label_index
770 .entry((*label.2).to_string())
771 .or_insert(Vec::new());
772 entry.retain(|x| x != &document.id);
773 }
774 }
775
776 fn _remove_from_entity_index(&mut self, document: &Document) {
777 for label in &document.label {
778 let indices = char_to_byte(document.text.clone(), label.0, label.1);
779 let name = document.text[indices.0..indices.1].to_string();
780 let entry = self
781 .documents_entities_index
782 .entry(name.to_lowercase())
783 .or_insert(Vec::new());
784 entry.retain(|x| x != &document.id);
785 }
786 }
787
788 fn unique_entities(entities: Vec<Entity>) -> Vec<Entity> {
789 entities
790 .into_iter()
791 .collect::<HashSet<Entity>>()
792 .into_iter()
793 .collect::<Vec<Entity>>()
794 }
795
796 pub fn document_hash(documents: &[Document]) -> HashMap<String, Document> {
797 documents
798 .iter()
799 .map(|document| (document.id.clone(), document.clone()))
800 .collect::<HashMap<String, Document>>()
801 }
802}