1use log::{debug, error};
10use serde::Deserialize;
11use std::{collections::HashSet, fs};
12use std::{fmt::Display, fmt::Formatter, iter::FromIterator};
13
14use crate::utils::{
15 contains_numbers, contains_punctuation, contains_special_characters, is_alphanumeric,
16};
17#[derive(Deserialize, Clone)]
28pub struct Config {
29 pub texts: Texts,
30 pub annotations: Annotations,
31 pub entities: Entities,
32 pub logging: Option<Logging>,
33}
34
35impl Default for Config {
36 fn default() -> Self {
37 Config {
38 texts: Texts::default(),
39 annotations: Annotations::default(),
40 entities: Entities::default(),
41 logging: Some(Logging::default()),
42 }
43 }
44}
45
46#[derive(Deserialize, Clone)]
48#[serde(default)]
49pub struct Logging {
50 pub level: String,
51}
52
53impl Default for Logging {
54 fn default() -> Self {
55 Logging {
56 level: "info".to_string(),
57 }
58 }
59}
60
61#[derive(Deserialize, Clone, Default)]
64pub struct Texts {
65 pub input: Input,
66 pub filters: Filters,
67}
68
69#[derive(Deserialize, Clone)]
71pub struct Input {
72 pub path: String,
73 pub filter: Option<bool>,
74}
75
76impl Default for Input {
77 fn default() -> Self {
78 Input {
79 path: "".to_string(),
80 filter: Some(true),
81 }
82 }
83}
84
85#[derive(Deserialize, Clone)]
87pub struct Filters {
88 pub alphanumeric: bool,
89 pub case_sensitive: bool,
90 pub min_length: i32,
91 pub max_length: i32,
92 pub punctuation: bool,
93 pub numbers: bool,
94 pub special_characters: bool,
95 pub accept_special_characters: Option<String>,
96 pub list_of_special_characters: Option<HashSet<char>>,
97}
98
99impl Default for Filters {
100 fn default() -> Self {
101 Filters {
102 alphanumeric: false,
103 case_sensitive: false,
104 min_length: 0,
105 max_length: 1024,
106 punctuation: false,
107 numbers: false,
108 special_characters: false,
109 accept_special_characters: None,
110 list_of_special_characters: Some(HashSet::new()),
111 }
112 }
113}
114
115impl Display for Filters {
116 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
117 write!(
118 f,
119 "alphanumeric: {}, case_sensitive: {}, min_length: {}, max_length: {}, punctuation: {}, numbers: {}, special_characters: {}, accept_special_characters: {:?}",
120 self.alphanumeric, self.case_sensitive, self.min_length, self.max_length, self.punctuation, self.numbers, self.special_characters, self.accept_special_characters
121 )
122 }
123}
124
125impl Filters {
126 pub fn set_special_characters(&mut self) {
127 let special_characters: HashSet<char> = HashSet::from_iter(vec![
128 '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '[', ']', '{', '}',
129 ';', ':', '"', '\'', '<', '>', ',', '.', '?', '/', '\\', '|', '~', '`',
130 ]);
131 let accept_special_characters: HashSet<char> = self
132 .accept_special_characters
133 .as_ref()
134 .unwrap_or(&"".to_string())
135 .chars()
136 .collect();
137 self.list_of_special_characters = Some(
138 special_characters
139 .difference(&accept_special_characters)
140 .cloned()
141 .collect(),
142 );
143 }
144
145 pub fn get_special_characters(&self) -> HashSet<char> {
146 self.list_of_special_characters.as_ref().unwrap().clone()
147 }
148
149 pub fn is_valid(&self, text: &str) -> bool {
158 if text.is_empty() {
159 return false;
160 }
161 if self.alphanumeric && is_alphanumeric(text) {
163 debug!("{} is not alphanumeric", text);
164 return false;
165 }
166 if self.punctuation && contains_punctuation(text) {
167 debug!("'{}' contains punctuation", text);
168 return false;
169 }
170 if self.numbers && contains_numbers(text) {
171 debug!("{} does not contain numbers", text);
172 return false;
173 }
174 if self.special_characters
175 && contains_special_characters(text, self.get_special_characters())
176 {
177 debug!("{} contains special characters", text);
178 return false;
179 }
180 if self.min_length >= 0 && text.len() < self.min_length as usize {
181 debug!("{} is too short", text);
182 return false;
183 }
184 if self.max_length >= 0 && text.len() > self.max_length as usize {
185 return false;
186 }
187 true
188 }
189}
190
191#[derive(Debug, Deserialize, Clone, Default)]
193pub struct Annotations {
194 pub output: Output,
195 pub format: Format,
196}
197
198#[derive(Debug, Deserialize, Clone, Default)]
200pub enum Format {
201 #[serde(rename = "csv")]
202 Csv,
203 #[serde(rename = "jsonl")]
204 #[default]
205 Jsonl,
206 #[serde(rename = "spacy")]
207 Spacy,
208 #[serde(rename = "brat")]
209 Brat,
210 #[serde(rename = "conll")]
211 Conll,
212}
213
214#[derive(Debug, Deserialize, Clone, Default)]
216pub struct Output {
217 pub path: String,
218}
219
220#[derive(Deserialize, Clone, Default)]
222pub struct Entities {
223 pub input: Input,
224 pub filters: Filters,
225 pub excludes: Excludes,
226}
227
228#[derive(Debug, Deserialize, Clone, Default)]
230pub struct Excludes {
231 pub path: Option<String>,
232}
233
234impl Config {
235 pub fn from_file(path: &str) -> Self {
236 let config = fs::read_to_string(path).expect("Unable to read the configuration file");
237 let config = toml::from_str(&config);
238 match config {
239 Ok(config) => config,
240 Err(e) => {
241 error!("Unable to parse the configuration file: {}", e);
242 std::process::exit(1);
243 }
244 }
245 }
246
247 pub fn summary(&self) {
248 debug!("------------------------------");
249 debug!("Configuration file summary |");
250 debug!("------------------------------");
251 debug!("Texts input path: {}", self.texts.input.path);
252 debug!("Texts filters: {}", self.texts.filters);
253 debug!("Annotations output path: {}", self.annotations.output.path);
254 debug!("Entities input path: {}", self.entities.input.path);
255 debug!("Entities filters: {}", self.entities.filters);
256 debug!(
257 "Entities excludes path: {}",
258 self.entities
259 .excludes
260 .path
261 .as_ref()
262 .unwrap_or(&"None".to_string())
263 );
264 }
265}