quickner/
config.rs

1// quickner
2//
3// NER tool for quick and simple NER annotation
4// Copyright (C) 2023, Omar MHAIMDAT
5//
6// Licensed under Mozilla Public License 2.0
7//
8
9use log::{debug, error};
10use serde::Deserialize;
11use std::{collections::HashSet, fs};
12use std::{fmt::Display, fmt::Formatter, iter::FromIterator};
13
14use crate::utils::{
15    contains_numbers, contains_punctuation, contains_special_characters, is_alphanumeric,
16};
17/// A struct representing the configuration file.
18/// # Examples
19/// ```
20/// use config::Config;
21/// let config = Config::from_file("./config.toml");
22/// ```
23/// # Panics
24/// Panics if the configuration file cannot be read or parsed.
25/// # Errors
26/// Returns an error if the configuration file cannot be read or parsed.
27#[derive(Deserialize, Clone)]
28pub struct Config {
29    pub texts: Texts,
30    pub annotations: Annotations,
31    pub entities: Entities,
32    pub logging: Option<Logging>,
33}
34
35// [logging]
36// level = "info"
37#[derive(Deserialize, Clone)]
38#[serde(default)]
39pub struct Logging {
40    pub level: String,
41}
42
43impl Default for Logging {
44    fn default() -> Self {
45        Logging {
46            level: "info".to_string(),
47        }
48    }
49}
50
51#[derive(Deserialize, Clone)]
52pub struct Texts {
53    pub input: Input,
54    pub filters: Filters,
55}
56
57#[derive(Deserialize, Clone)]
58pub struct Input {
59    pub path: String,
60    pub filter: Option<bool>,
61}
62
63#[derive(Deserialize, Clone)]
64pub struct Filters {
65    pub alphanumeric: bool,
66    pub case_sensitive: bool,
67    pub min_length: i32,
68    pub max_length: i32,
69    pub punctuation: bool,
70    pub numbers: bool,
71    pub special_characters: bool,
72    pub accept_special_characters: Option<String>,
73    pub list_of_special_characters: Option<HashSet<char>>,
74}
75
76impl Default for Filters {
77    fn default() -> Self {
78        Filters {
79            alphanumeric: false,
80            case_sensitive: false,
81            min_length: 0,
82            max_length: 1024,
83            punctuation: false,
84            numbers: false,
85            special_characters: false,
86            accept_special_characters: None,
87            list_of_special_characters: Some(HashSet::new()),
88        }
89    }
90}
91
92impl Display for Filters {
93    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
94        write!(
95            f,
96            "alphanumeric: {}, case_sensitive: {}, min_length: {}, max_length: {}, punctuation: {}, numbers: {}, special_characters: {}, accept_special_characters: {:?}",
97            self.alphanumeric, self.case_sensitive, self.min_length, self.max_length, self.punctuation, self.numbers, self.special_characters, self.accept_special_characters
98        )
99    }
100}
101
102// Post init function to set the special characters
103impl Filters {
104    pub fn set_special_characters(&mut self) {
105        let special_characters: HashSet<char> = HashSet::from_iter(vec![
106            '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '[', ']', '{', '}',
107            ';', ':', '"', '\'', '<', '>', ',', '.', '?', '/', '\\', '|', '~', '`',
108        ]);
109        let accept_special_characters: HashSet<char> = self
110            .accept_special_characters
111            .as_ref()
112            .unwrap_or(&"".to_string())
113            .chars()
114            .collect();
115        self.list_of_special_characters = Some(
116            special_characters
117                .difference(&accept_special_characters)
118                .cloned()
119                .collect(),
120        );
121    }
122
123    pub fn get_special_characters(&self) -> HashSet<char> {
124        self.list_of_special_characters.as_ref().unwrap().clone()
125    }
126
127    /// Checks if a string is a valid entity.
128    /// Using the configuration file, it checks if the string is alphanumeric, contains punctuation, numbers, or special characters.
129    /// # Examples
130    /// ```
131    /// use utils::is_valid;
132    /// let text = "Hello, world!";
133    /// assert_eq!(is_valid(config, text), true);
134    /// ```
135    pub fn is_valid(&self, text: &str) -> bool {
136        if text.is_empty() {
137            return false;
138        }
139        // False
140        if self.alphanumeric && is_alphanumeric(text) {
141            debug!("{} is not alphanumeric", text);
142            return false;
143        }
144        if self.punctuation && contains_punctuation(text) {
145            debug!("'{}' contains punctuation", text);
146            return false;
147        }
148        if self.numbers && contains_numbers(text) {
149            debug!("{} does not contain numbers", text);
150            return false;
151        }
152        if self.special_characters
153            && contains_special_characters(text, self.get_special_characters())
154        {
155            debug!("{} contains special characters", text);
156            return false;
157        }
158        if self.min_length >= 0 && text.len() < self.min_length as usize {
159            debug!("{} is too short", text);
160            return false;
161        }
162        if self.max_length >= 0 && text.len() > self.max_length as usize {
163            return false;
164        }
165        true
166    }
167}
168
169#[derive(Debug, Deserialize, Clone)]
170pub struct Annotations {
171    pub output: Output,
172    pub format: Format,
173}
174
175#[derive(Debug, Deserialize, Clone)]
176pub enum Format {
177    #[serde(rename = "csv")]
178    Csv,
179    #[serde(rename = "jsonl")]
180    Jsonl,
181    #[serde(rename = "spacy")]
182    Spacy,
183    #[serde(rename = "brat")]
184    Brat,
185    #[serde(rename = "conll")]
186    Conll,
187}
188#[derive(Debug, Deserialize, Clone)]
189pub struct Output {
190    pub path: String,
191}
192
193#[derive(Deserialize, Clone)]
194pub struct Entities {
195    pub input: Input,
196    pub filters: Filters,
197    pub excludes: Excludes,
198}
199
200#[derive(Debug, Deserialize, Clone)]
201pub struct Excludes {
202    pub path: Option<String>,
203}
204
205impl Config {
206    pub fn from_file(path: &str) -> Self {
207        let config = fs::read_to_string(path).expect("Unable to read the configuration file");
208        let config = toml::from_str(&config);
209        match config {
210            Ok(config) => config,
211            Err(e) => {
212                error!("Unable to parse the configuration file: {}", e);
213                std::process::exit(1);
214            }
215        }
216    }
217
218    pub fn summary(&self) {
219        debug!("------------------------------");
220        debug!("Configuration file summary    |");
221        debug!("------------------------------");
222        debug!("Texts input path: {}", self.texts.input.path);
223        debug!("Texts filters: {}", self.texts.filters);
224        debug!("Annotations output path: {}", self.annotations.output.path);
225        debug!("Entities input path: {}", self.entities.input.path);
226        debug!("Entities filters: {}", self.entities.filters);
227        debug!(
228            "Entities excludes path: {}",
229            self.entities
230                .excludes
231                .path
232                .as_ref()
233                .unwrap_or(&"None".to_string())
234        );
235    }
236}