1use log::{debug, error};
10use serde::Deserialize;
11use std::{collections::HashSet, fs};
12use std::{fmt::Display, fmt::Formatter, iter::FromIterator};
13
14use crate::utils::{
15 contains_numbers, contains_punctuation, contains_special_characters, is_alphanumeric,
16};
17#[derive(Deserialize, Clone)]
28pub struct Config {
29 pub texts: Texts,
30 pub annotations: Annotations,
31 pub entities: Entities,
32 pub logging: Option<Logging>,
33}
34
35#[derive(Deserialize, Clone)]
38#[serde(default)]
39pub struct Logging {
40 pub level: String,
41}
42
43impl Default for Logging {
44 fn default() -> Self {
45 Logging {
46 level: "info".to_string(),
47 }
48 }
49}
50
51#[derive(Deserialize, Clone)]
52pub struct Texts {
53 pub input: Input,
54 pub filters: Filters,
55}
56
57#[derive(Deserialize, Clone)]
58pub struct Input {
59 pub path: String,
60 pub filter: Option<bool>,
61}
62
63#[derive(Deserialize, Clone)]
64pub struct Filters {
65 pub alphanumeric: bool,
66 pub case_sensitive: bool,
67 pub min_length: i32,
68 pub max_length: i32,
69 pub punctuation: bool,
70 pub numbers: bool,
71 pub special_characters: bool,
72 pub accept_special_characters: Option<String>,
73 pub list_of_special_characters: Option<HashSet<char>>,
74}
75
76impl Default for Filters {
77 fn default() -> Self {
78 Filters {
79 alphanumeric: false,
80 case_sensitive: false,
81 min_length: 0,
82 max_length: 1024,
83 punctuation: false,
84 numbers: false,
85 special_characters: false,
86 accept_special_characters: None,
87 list_of_special_characters: Some(HashSet::new()),
88 }
89 }
90}
91
92impl Display for Filters {
93 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
94 write!(
95 f,
96 "alphanumeric: {}, case_sensitive: {}, min_length: {}, max_length: {}, punctuation: {}, numbers: {}, special_characters: {}, accept_special_characters: {:?}",
97 self.alphanumeric, self.case_sensitive, self.min_length, self.max_length, self.punctuation, self.numbers, self.special_characters, self.accept_special_characters
98 )
99 }
100}
101
102impl Filters {
104 pub fn set_special_characters(&mut self) {
105 let special_characters: HashSet<char> = HashSet::from_iter(vec![
106 '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '[', ']', '{', '}',
107 ';', ':', '"', '\'', '<', '>', ',', '.', '?', '/', '\\', '|', '~', '`',
108 ]);
109 let accept_special_characters: HashSet<char> = self
110 .accept_special_characters
111 .as_ref()
112 .unwrap_or(&"".to_string())
113 .chars()
114 .collect();
115 self.list_of_special_characters = Some(
116 special_characters
117 .difference(&accept_special_characters)
118 .cloned()
119 .collect(),
120 );
121 }
122
123 pub fn get_special_characters(&self) -> HashSet<char> {
124 self.list_of_special_characters.as_ref().unwrap().clone()
125 }
126
127 pub fn is_valid(&self, text: &str) -> bool {
136 if text.is_empty() {
137 return false;
138 }
139 if self.alphanumeric && is_alphanumeric(text) {
141 debug!("{} is not alphanumeric", text);
142 return false;
143 }
144 if self.punctuation && contains_punctuation(text) {
145 debug!("'{}' contains punctuation", text);
146 return false;
147 }
148 if self.numbers && contains_numbers(text) {
149 debug!("{} does not contain numbers", text);
150 return false;
151 }
152 if self.special_characters
153 && contains_special_characters(text, self.get_special_characters())
154 {
155 debug!("{} contains special characters", text);
156 return false;
157 }
158 if self.min_length >= 0 && text.len() < self.min_length as usize {
159 debug!("{} is too short", text);
160 return false;
161 }
162 if self.max_length >= 0 && text.len() > self.max_length as usize {
163 return false;
164 }
165 true
166 }
167}
168
169#[derive(Debug, Deserialize, Clone)]
170pub struct Annotations {
171 pub output: Output,
172 pub format: Format,
173}
174
175#[derive(Debug, Deserialize, Clone)]
176pub enum Format {
177 #[serde(rename = "csv")]
178 Csv,
179 #[serde(rename = "jsonl")]
180 Jsonl,
181 #[serde(rename = "spacy")]
182 Spacy,
183 #[serde(rename = "brat")]
184 Brat,
185 #[serde(rename = "conll")]
186 Conll,
187}
188#[derive(Debug, Deserialize, Clone)]
189pub struct Output {
190 pub path: String,
191}
192
193#[derive(Deserialize, Clone)]
194pub struct Entities {
195 pub input: Input,
196 pub filters: Filters,
197 pub excludes: Excludes,
198}
199
200#[derive(Debug, Deserialize, Clone)]
201pub struct Excludes {
202 pub path: Option<String>,
203}
204
205impl Config {
206 pub fn from_file(path: &str) -> Self {
207 let config = fs::read_to_string(path).expect("Unable to read the configuration file");
208 let config = toml::from_str(&config);
209 match config {
210 Ok(config) => config,
211 Err(e) => {
212 error!("Unable to parse the configuration file: {}", e);
213 std::process::exit(1);
214 }
215 }
216 }
217
218 pub fn summary(&self) {
219 debug!("------------------------------");
220 debug!("Configuration file summary |");
221 debug!("------------------------------");
222 debug!("Texts input path: {}", self.texts.input.path);
223 debug!("Texts filters: {}", self.texts.filters);
224 debug!("Annotations output path: {}", self.annotations.output.path);
225 debug!("Entities input path: {}", self.entities.input.path);
226 debug!("Entities filters: {}", self.entities.filters);
227 debug!(
228 "Entities excludes path: {}",
229 self.entities
230 .excludes
231 .path
232 .as_ref()
233 .unwrap_or(&"None".to_string())
234 );
235 }
236}