quickner/
config.rs

1// quickner
2//
3// NER tool for quick and simple NER annotation
4// Copyright (C) 2023, Omar MHAIMDAT
5//
6// Licensed under Mozilla Public License 2.0
7//
8
9use log::{debug, error};
10use serde::Deserialize;
11use std::{collections::HashSet, fs};
12use std::{fmt::Display, fmt::Formatter, iter::FromIterator};
13
14use crate::utils::{
15    contains_numbers, contains_punctuation, contains_special_characters, is_alphanumeric,
16};
/// A struct representing the configuration file.
///
/// Groups the settings for the input texts, the produced annotations, the
/// entities, and the optional logging section. The file is parsed as TOML by
/// [`Config::from_file`].
///
/// # Examples
/// ```
/// use config::Config;
/// let config = Config::from_file("./config.toml");
/// ```
/// # Panics
/// Loading through [`Config::from_file`] panics if the configuration file
/// cannot be read. A file that is read but cannot be parsed does not panic:
/// the error is logged and the process exits with status 1.
#[derive(Deserialize, Clone)]
pub struct Config {
    /// Settings for the input texts (input source and filters).
    pub texts: Texts,
    /// Settings for the annotations (output location and format).
    pub annotations: Annotations,
    /// Settings for the entities (input source, filters and excludes).
    pub entities: Entities,
    /// Optional logging settings.
    pub logging: Option<Logging>,
}
34
35impl Default for Config {
36    fn default() -> Self {
37        Config {
38            texts: Texts::default(),
39            annotations: Annotations::default(),
40            entities: Entities::default(),
41            logging: Some(Logging::default()),
42        }
43    }
44}
45
/// A struct used to deserialize logging from the configuration file.
///
/// The container-level `#[serde(default)]` means fields missing from the file
/// are taken from `Logging::default()`.
#[derive(Deserialize, Clone)]
#[serde(default)]
pub struct Logging {
    /// Name of the log level; the default value is `"info"`.
    pub level: String,
}
52
53impl Default for Logging {
54    fn default() -> Self {
55        Logging {
56            level: "info".to_string(),
57        }
58    }
59}
60
/// A struct used to deserialize the texts section from the configuration file.
#[derive(Deserialize, Clone, Default)]
pub struct Texts {
    /// Input source of the texts.
    pub input: Input,
    /// Filter settings for the texts.
    pub filters: Filters,
}
68
/// A struct used to deserialize input from the configuration file.
#[derive(Deserialize, Clone)]
pub struct Input {
    /// Path of the input; an empty string in `Input::default()`.
    pub path: String,
    /// Optional flag, `Some(true)` in `Input::default()`.
    /// NOTE(review): presumably toggles filtering of this input; it is not
    /// read in this file, so confirm against the code that consumes it.
    pub filter: Option<bool>,
}
75
76impl Default for Input {
77    fn default() -> Self {
78        Input {
79            path: "".to_string(),
80            filter: Some(true),
81        }
82    }
83}
84
/// A struct used to deserialize filters from the configuration file.
///
/// The boolean flags and the length bounds drive the rejection rules applied
/// by `Filters::is_valid`.
#[derive(Deserialize, Clone)]
pub struct Filters {
    /// When `true`, `is_valid` rejects texts for which `is_alphanumeric`
    /// returns `true`.
    pub alphanumeric: bool,
    /// Not read anywhere in this file.
    /// NOTE(review): presumably controls case-sensitive matching elsewhere;
    /// confirm against the callers.
    pub case_sensitive: bool,
    /// Minimum accepted length, compared against `str::len` (a byte count).
    /// A negative value disables the check.
    pub min_length: i32,
    /// Maximum accepted length, compared against `str::len` (a byte count).
    /// A negative value disables the check.
    pub max_length: i32,
    /// When `true`, `is_valid` rejects texts for which `contains_punctuation`
    /// returns `true`.
    pub punctuation: bool,
    /// When `true`, `is_valid` rejects texts for which `contains_numbers`
    /// returns `true`.
    pub numbers: bool,
    /// When `true`, `is_valid` rejects texts containing a character from the
    /// special-character set.
    pub special_characters: bool,
    /// Characters removed from the built-in special-character list when
    /// `set_special_characters` builds `list_of_special_characters`.
    pub accept_special_characters: Option<String>,
    /// The special-character set used by `is_valid`; filled in by
    /// `set_special_characters`.
    pub list_of_special_characters: Option<HashSet<char>>,
}
98
99impl Default for Filters {
100    fn default() -> Self {
101        Filters {
102            alphanumeric: false,
103            case_sensitive: false,
104            min_length: 0,
105            max_length: 1024,
106            punctuation: false,
107            numbers: false,
108            special_characters: false,
109            accept_special_characters: None,
110            list_of_special_characters: Some(HashSet::new()),
111        }
112    }
113}
114
115impl Display for Filters {
116    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
117        write!(
118            f,
119            "alphanumeric: {}, case_sensitive: {}, min_length: {}, max_length: {}, punctuation: {}, numbers: {}, special_characters: {}, accept_special_characters: {:?}",
120            self.alphanumeric, self.case_sensitive, self.min_length, self.max_length, self.punctuation, self.numbers, self.special_characters, self.accept_special_characters
121        )
122    }
123}
124
125impl Filters {
126    pub fn set_special_characters(&mut self) {
127        let special_characters: HashSet<char> = HashSet::from_iter(vec![
128            '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '[', ']', '{', '}',
129            ';', ':', '"', '\'', '<', '>', ',', '.', '?', '/', '\\', '|', '~', '`',
130        ]);
131        let accept_special_characters: HashSet<char> = self
132            .accept_special_characters
133            .as_ref()
134            .unwrap_or(&"".to_string())
135            .chars()
136            .collect();
137        self.list_of_special_characters = Some(
138            special_characters
139                .difference(&accept_special_characters)
140                .cloned()
141                .collect(),
142        );
143    }
144
145    pub fn get_special_characters(&self) -> HashSet<char> {
146        self.list_of_special_characters.as_ref().unwrap().clone()
147    }
148
149    /// Checks if a string is a valid entity.
150    /// Using the configuration file, it checks if the string is alphanumeric, contains punctuation, numbers, or special characters.
151    /// # Examples
152    /// ```
153    /// use utils::is_valid;
154    /// let text = "Hello, world!";
155    /// assert_eq!(is_valid(config, text), true);
156    /// ```
157    pub fn is_valid(&self, text: &str) -> bool {
158        if text.is_empty() {
159            return false;
160        }
161        // False
162        if self.alphanumeric && is_alphanumeric(text) {
163            debug!("{} is not alphanumeric", text);
164            return false;
165        }
166        if self.punctuation && contains_punctuation(text) {
167            debug!("'{}' contains punctuation", text);
168            return false;
169        }
170        if self.numbers && contains_numbers(text) {
171            debug!("{} does not contain numbers", text);
172            return false;
173        }
174        if self.special_characters
175            && contains_special_characters(text, self.get_special_characters())
176        {
177            debug!("{} contains special characters", text);
178            return false;
179        }
180        if self.min_length >= 0 && text.len() < self.min_length as usize {
181            debug!("{} is too short", text);
182            return false;
183        }
184        if self.max_length >= 0 && text.len() > self.max_length as usize {
185            return false;
186        }
187        true
188    }
189}
190
/// A struct used to deserialize annotations from the configuration file.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct Annotations {
    /// Output location of the annotations.
    pub output: Output,
    /// Format of the annotations; `Format::Jsonl` in `Annotations::default()`.
    pub format: Format,
}
197
198/// A struct used to deserialize output format from the configuration file.
199#[derive(Debug, Deserialize, Clone, Default)]
200pub enum Format {
201    #[serde(rename = "csv")]
202    Csv,
203    #[serde(rename = "jsonl")]
204    #[default]
205    Jsonl,
206    #[serde(rename = "spacy")]
207    Spacy,
208    #[serde(rename = "brat")]
209    Brat,
210    #[serde(rename = "conll")]
211    Conll,
212}
213
/// A struct used to deserialize output from the configuration file.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct Output {
    /// Path of the output; an empty string in the derived `Default`.
    pub path: String,
}
219
/// A struct used to deserialize entities from the configuration file.
#[derive(Deserialize, Clone, Default)]
pub struct Entities {
    /// Input source of the entities.
    pub input: Input,
    /// Filter settings for the entities.
    pub filters: Filters,
    /// Settings for the excludes.
    pub excludes: Excludes,
}
227
/// A struct used to deserialize excludes from the configuration file.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct Excludes {
    /// Optional path of the excludes; `None` in the derived `Default`.
    pub path: Option<String>,
}
233
234impl Config {
235    pub fn from_file(path: &str) -> Self {
236        let config = fs::read_to_string(path).expect("Unable to read the configuration file");
237        let config = toml::from_str(&config);
238        match config {
239            Ok(config) => config,
240            Err(e) => {
241                error!("Unable to parse the configuration file: {}", e);
242                std::process::exit(1);
243            }
244        }
245    }
246
247    pub fn summary(&self) {
248        debug!("------------------------------");
249        debug!("Configuration file summary    |");
250        debug!("------------------------------");
251        debug!("Texts input path: {}", self.texts.input.path);
252        debug!("Texts filters: {}", self.texts.filters);
253        debug!("Annotations output path: {}", self.annotations.output.path);
254        debug!("Entities input path: {}", self.entities.input.path);
255        debug!("Entities filters: {}", self.entities.filters);
256        debug!(
257            "Entities excludes path: {}",
258            self.entities
259                .excludes
260                .path
261                .as_ref()
262                .unwrap_or(&"None".to_string())
263        );
264    }
265}