test_data_generation/engine/mod.rs
1//!
2//!
3//! # Fact
4//! The Fact object is a representation of a character based on its context within a data entity.
5//! Facts are created during the analyze process and then later used to generate data from the algorithm.
6//!
7//! ## Example
8//!
9//! ```rust
10//! extern crate test_data_generation;
11//!
12//! use test_data_generation::engine::Fact;
13//!
14//! fn main() {
15//! //fact created for the character 'r' in the string "word"
16//! let mut fact = Fact::new('r','c',0,0,2);
17//!
18//! // set the char that appears after the 'r'
19//! fact.set_next_key('d');
20//!
21//! // set the char that appears before the 'r'
22//! fact.set_prior_key('o');
23//! }
24//! ```
25//!
26//! # PatternDefinition
27//! The PatternDefinition provides functionality to retrieve symbols that are used in defining a pattern.
28//!
29//! Here is the list of symbols that identify a type of character:</br>
//! @ = unknown [Unknown]</br>
31//! C = upper case consonant [ConsonantUpper]</br>
32//! c = lower case consonant [ConsonantLower]</br>
33//! V = upper case vowel [VowelUpper]</br>
34//! v = lower case vowel [VowelLower]</br>
35//! \# = numeric digit [Numeric]</br>
36//! ~ = special regex character [RegExSpcChar]</br>
37//! S = white space [WhiteSpace]</br>
38//! p = punctuation [Punctuation]</br>
39//!
40//! ## Example
41//!
42//! ```rust
43//! extern crate test_data_generation;
44//!
45//! use test_data_generation::engine::PatternDefinition;
46//!
47//! fn main() {
48//! let pttrn_def = PatternDefinition::new();
49//! println!("Upper case vowel symbol: {:?}", pttrn_def.get(&"VowelUpper".to_string()));
50//! }
51//! ```
52
53use regex::Regex;
54use serde_json;
55use std::collections::BTreeMap;
56use std::sync::mpsc;
57use std::sync::mpsc::{Receiver, Sender};
58use std::thread;
59
60use crate::Profile;
61//use async_trait::async_trait;
62
/// Expands to a `&'static regex::Regex` for the given pattern literal.
///
/// Every expansion site gets its own `OnceCell`, so the pattern is compiled the first
/// time that site is reached and the compiled value is handed back on later visits.
/// An invalid pattern panics on that first use (via `unwrap`).
macro_rules! regex {
    ($pattern:literal $(,)?) => {{
        static COMPILED: once_cell::sync::OnceCell<regex::Regex> =
            once_cell::sync::OnceCell::new();
        COMPILED.get_or_init(|| regex::Regex::new($pattern).unwrap())
    }};
}
69
/// Lookup table from a symbol name (for example `"VowelUpper"`) to the single `char`
/// placeholder that stands for that kind of character in a pattern.
// NOTE(review): `PatternDefinition` in this file uses the alias, so the
// `allow(dead_code)` below looks unnecessary - confirm before removing it.
#[allow(dead_code)]
type PatternMap = BTreeMap<String, char>;
72
#[derive(Clone, Serialize, Deserialize, Debug)]
/// Represents a Fact for a character in a sample data entity that has been analyzed.
///
/// A Fact records one char together with its immediate neighbours, its pattern
/// placeholder symbol and its position in the entity. It derives `Serialize` and
/// `Deserialize`, so it can be written to and restored from JSON.
pub struct Fact {
    /// the char that the fact defines (e.g. 'a', '1', '%', etc.)
    pub key: char,
    /// the char that appears before (-1) the key in the entity; `None` when not set
    pub prior_key: Option<char>,
    /// the char that appears after (+1) the key in the entity; `None` when not set
    pub next_key: Option<char>,
    /// the PatternPlaceholder symbol that represents the type of key
    pub pattern_placeholder: char,
    /// indicates if the key is the first char in the entity (0=no, 1=yes)
    pub starts_with: u32,
    /// indicates if the key is the last char in the entity (0=no, 1=yes)
    pub ends_with: u32,
    /// indicates the number of positions from the index zero (where the char is located in the entity from the first position)
    pub index_offset: u32,
}
91
92impl Fact {
93 /// Constructs a new Fact
94 ///
95 /// # Arguments
96 ///
97 /// * `k: char` - The char that the Fact represents (also known as the `key`).</br>
98 /// * `pp: char` - The char that represents the patter placeholder for the key.</br>
99 /// * `sw: u32` - Indicates is the key is the first char in the entity. (0=no, 1=yes)</br>
100 /// * `ew: u32` - Indicates is the key is the last char in the entity. (0=no, 1=yes)</br>
101 /// * `idx_off: u32` - The index that represents the postion of the key from the beginning of the entity (zero based).</br>
102 ///
103 /// # Example
104 ///
105 /// ```rust
106 /// extern crate test_data_generation;
107 ///
108 /// use test_data_generation::engine::Fact;
109 ///
110 /// fn main() {
111 /// //fact created for the character 'r' in the string "word"
112 /// let mut fact = Fact::new('r','c',0,0,2);
113 /// }
114 /// ```
115 #[inline]
116 pub fn new(k: char, pp: char, sw: u32, ew: u32, idx_off: u32) -> Fact {
117 Fact {
118 key: k,
119 prior_key: None,
120 next_key: None,
121 pattern_placeholder: pp,
122 starts_with: sw,
123 ends_with: ew,
124 index_offset: idx_off,
125 }
126 }
127
128 /// Constructs a new Fact from a serialized (JSON) string of the Fact object. This is used when restoring from "archive"
129 ///
130 /// # Arguments
131 ///
132 /// * `serialized: &str` - The JSON string that represents the archived Fact object.</br>
133 ///
134 /// # Example
135 ///
136 /// ```rust
137 /// extern crate test_data_generation;
138 ///
139 /// use test_data_generation::engine::Fact;
140 ///
141 /// fn main() {
142 /// let serialized = "{\"key\":\"r\",\"prior_key\":null,\"next_key\":null,\"pattern_placeholder\":\"c\",\"starts_with\":0,\"ends_with\":0,\"index_offset\":2}";
143 /// let mut fact = Fact::from_serialized(&serialized);
144 /// fact.set_prior_key('a');
145 /// fact.set_next_key('e');
146 ///
147 /// assert_eq!(fact.pattern_placeholder, 'c');
148 /// }
149 /// ```
150 #[inline]
151 pub fn from_serialized(serialized: &str) -> Fact {
152 serde_json::from_str(&serialized).unwrap()
153 }
154
155 /// This function converts the Fact to a serialize JSON string.
156 ///
157 /// # Example
158 ///
159 /// ```rust
160 /// extern crate test_data_generation;
161 ///
162 /// use test_data_generation::engine::Fact;
163 ///
164 /// fn main() {
165 /// //fact created for the character 'r' in the string "word"
166 /// let mut fact = Fact::new('r','c',0,0,2);
167 ///
168 /// println!("{}", fact.serialize());
169 /// // {"key":"r","prior_key":null,"next_key":null,"pattern_placeholder":"c","starts_with":0,"ends_with":0,"index_offset":2}
170 /// }
171 ///
172 #[inline]
173 pub fn serialize(&mut self) -> String {
174 serde_json::to_string(&self).unwrap()
175 }
176
177 /// This function sets the next key attribute to the specified char.
178 ///
179 /// # Arguments
180 ///
181 /// * `nk: char` - The character that represents the next character in the entity
182 ///
183 /// # Example
184 ///
185 /// ```rust
186 /// extern crate test_data_generation;
187 ///
188 /// use test_data_generation::engine::Fact;
189 ///
190 /// fn main() {
191 /// //fact created for the character 'r' in the string "word"
192 /// let mut fact = Fact::new('r','c',0,0,2);
193 /// fact.set_next_key('d');
194 /// }
195 ///
196 #[inline]
197 pub fn set_next_key(&mut self, nk: char) {
198 self.next_key = Some(nk);
199 }
200
201 /// This function sets the prior key attribute to the specified char.
202 ///
203 /// # Arguments
204 ///
205 /// * `pk: char` - The character that represents the prior character in the entity
206 ///
207 /// # Example
208 ///
209 /// ```rust
210 /// extern crate test_data_generation;
211 ///
212 /// use test_data_generation::engine::Fact;
213 ///
214 /// fn main() {
215 /// //fact created for the character 'r' in the string "word"
216 /// let mut fact = Fact::new('r','c',0,0,2);
217 /// fact.set_prior_key('o');
218 /// }
219 ///
220 #[inline]
221 pub fn set_prior_key(&mut self, pk: char) {
222 self.prior_key = Some(pk);
223 }
224}
225
/// Represents a symbolic pattern of an entity (String)
///
/// Holds one regex rule per character category. Each field is a `&'static Regex`;
/// the `Default` implementation in this file fills them in through the `regex!` macro.
pub struct Pattern {
    /// The regex rule used to find upper case consonants
    regex_consonant_upper: &'static Regex,
    /// The regex rule used to find lower case consonants
    regex_consonant_lower: &'static Regex,
    /// The regex rule used to find upper case vowels
    regex_vowel_upper: &'static Regex,
    /// The regex rule used to find lower case vowels
    regex_vowel_lower: &'static Regex,
    /// The regex rule used to find numeric digits
    regex_numeric: &'static Regex,
    /// The regex rule used to find punctuation
    regex_punctuation: &'static Regex,
    /// The regex rule used to find white spaces
    regex_space: &'static Regex,
}
243
244impl Default for Pattern {
245 fn default() -> Self {
246 Pattern {
247 regex_consonant_upper: regex!(r"(?-u)[B-DF-HJ-NP-TV-Z]"),
248 regex_consonant_lower: regex!(r"(?-u)[b-df-hj-np-tv-z]"),
249 regex_vowel_upper: regex!(r"(?-u)[A|E|I|O|U]"),
250 regex_vowel_lower: regex!(r"(?-u)[a|e|i|o|u]"),
251 regex_numeric: regex!(r"(?-u)[0-9]"),
252 regex_punctuation: regex!(r"(?-u)[.,\\/#!$%\\^&\\*;:{}=\\-_`~()\\?]"),
253 regex_space: regex!(r"(?-u)[\s]"),
254 }
255 }
256}
257
/// Represents the object managing all the symbols used in pattern definitions
///
/// Combines the symbol lookup table with the regex rules that decide which symbol a
/// given char maps to.
pub struct PatternDefinition {
    /// symbol name (e.g. "VowelUpper") -> placeholder char (e.g. 'V')
    pattern_map: PatternMap,
    /// the regex rules used to classify a char
    pattern: Pattern,
}
263
264impl PatternDefinition {
265 /// Constructs a new PatternDefinition
266 ///
267 /// # Example
268 ///
269 /// ```rust
270 /// extern crate test_data_generation;
271 ///
272 /// use test_data_generation::engine::PatternDefinition;
273 ///
274 /// fn main() {
275 /// let pttrn_def = PatternDefinition::new();
276 /// }
277 /// ```
278 pub fn new() -> PatternDefinition {
279 let symbols: [char; 9] = ['@', 'C', 'c', 'V', 'v', '#', '~', 'S', 'p'];
280 let mut pttrn_def = PatternMap::new();
281
282 pttrn_def.insert("Unknown".to_string(), symbols[0]);
283 pttrn_def.insert("ConsonantUpper".to_string(), symbols[1]);
284 pttrn_def.insert("ConsonantLower".to_string(), symbols[2]);
285 pttrn_def.insert("VowelUpper".to_string(), symbols[3]);
286 pttrn_def.insert("VowelLower".to_string(), symbols[4]);
287 pttrn_def.insert("Numeric".to_string(), symbols[5]);
288 pttrn_def.insert("RegExSpcChar".to_string(), symbols[6]);
289 pttrn_def.insert("WhiteSpace".to_string(), symbols[7]);
290 pttrn_def.insert("Punctuation".to_string(), symbols[8]);
291
292 PatternDefinition {
293 pattern_map: pttrn_def,
294 pattern: Pattern::default(),
295 }
296 }
297
298 /// This function converts an entity (&str) into a tuplet (String, Vec<Fact>)</br>
299 ///
300 /// # Arguments
301 ///
302 /// * `entity: String` - The textual str of the value to analyze.</br>
303 ///
304 /// # Example
305 ///
306 /// ```rust
307 /// extern crate test_data_generation;
308 ///
309 /// use test_data_generation::engine::PatternDefinition;
310 ///
311 /// fn main() {
312 /// let mut pttrn_def = PatternDefinition::new();
313 /// //async {
314 /// let rslt = pttrn_def.analyze("Hello World");
315 /// assert_eq!(rslt.0, "CvccvSCvccc");
316 /// //}
317 /// }
318 /// ```
319 #[inline]
320 pub fn analyze(&mut self, entity: &str) -> (String, Vec<Fact>) {
321 // record the length of the passed value
322 //self.size = entity.len() as u32;
323
324 // String to hold the pattern
325 let mut pttrn = String::new();
326
327 // Vec to hold all the Facts to be returned
328 let mut facts = Vec::new();
329
330 // record the pattern of the passed value
331 for (i, _c) in entity.chars().enumerate() {
332 //let fact = self.factualize(&entity, i as u32);
333 let idx: u32 = i as u32;
334 let fact = self.factualize(entity, idx);
335 pttrn.push_str(&*fact.pattern_placeholder.to_string());
336 facts.push(fact);
337 }
338
339 (pttrn, facts)
340 }
341
342 /// This function converts a char in an entity (&str) based on the index specified into a Fact</br>
343 ///
344 /// # Arguments
345 ///
346 /// * `entity: String` - The textual str of the value to analyze.</br>
347 /// * `idx: u32` - The index that specifies the position of the char in the entity to convert to a Fact.</br>
348 ///
349 /// # Example
350 ///
351 /// ```rust
352 /// extern crate test_data_generation;
353 ///
354 /// use test_data_generation::engine::PatternDefinition;
355 ///
356 /// fn main() {
357 /// let mut pttrn_def = PatternDefinition::new();
358 /// let fact = pttrn_def.factualize("Word",0);
359 /// // will return a Fact that represents the char `W`
360 /// }
361 /// ```
362 #[inline]
363 pub fn factualize(&mut self, entity: &str, idx: u32) -> Fact {
364 let c = entity.chars().nth(idx as usize).unwrap();
365 let pp = self.symbolize_char(c);
366 let pk = if idx > 0 {
367 entity.chars().nth(idx as usize - 1)
368 } else {
369 None
370 };
371 let nk = if idx < entity.len() as u32 - 1 {
372 entity.chars().nth(idx as usize + 1)
373 } else {
374 None
375 };
376 let sw = if idx == 0 { 1 } else { 0 };
377 let ew = if idx == entity.len() as u32 - 1 { 1 } else { 0 };
378
379 let mut fact = Fact::new(c, pp, sw, ew, idx);
380
381 // only if there is a next key
382 if nk.is_some() {
383 let _ = &fact.set_next_key(nk.unwrap());
384 }
385
386 // only if there is a prior key
387 if pk.is_some() {
388 let _ = &fact.set_prior_key(pk.unwrap());
389 }
390
391 fact
392 }
393
394 /// This function returns a pattern symbol that represents the type of character
395 ///
396 /// # Example
397 ///
398 /// ```rust
399 /// extern crate test_data_generation;
400 ///
401 /// use test_data_generation::engine::PatternDefinition;
402 ///
403 /// fn main() {
404 /// let pttrn_def = PatternDefinition::new();
405 /// println!("Upper case vowel symbol: {:?}", pttrn_def.get(&"VowelUpper".to_string()));
406 /// }
407 /// ```
408 #[inline]
409 pub fn get(&self, key: &str) -> char {
410 *self.pattern_map.get(key).unwrap()
411 }
412
413 /// This function converts a char into a pattern symbol
414 ///
415 /// # Example
416 ///
417 /// ```rust
418 /// extern crate test_data_generation;
419 ///
420 /// use test_data_generation::engine::PatternDefinition;
421 ///
422 /// fn main() {
423 /// let pttrn_def = PatternDefinition::new();
424 /// println!("The pattern symbol for 'A' is {:?}", pttrn_def.symbolize_char('A'));
425 /// // The pattern symbol for 'A' is V
426 /// }
427 /// ```
428 #[inline]
429 pub fn symbolize_char(&self, c: char) -> char {
430 // if you have to escape regex special characters: &*regex::escape(&*$c.to_string())
431 let mut symbol = self.pattern_map.get("Unknown");
432 let mut found = false;
433
434 if !found && self.pattern.regex_consonant_upper.is_match(&c.to_string()) {
435 symbol = self.pattern_map.get("ConsonantUpper");
436 found = true;
437 }
438
439 if !found && self.pattern.regex_consonant_lower.is_match(&c.to_string()) {
440 symbol = self.pattern_map.get("ConsonantLower");
441 found = true;
442 }
443
444 if !found && self.pattern.regex_vowel_upper.is_match(&c.to_string()) {
445 symbol = self.pattern_map.get("VowelUpper");
446 found = true;
447 }
448
449 if !found && self.pattern.regex_vowel_lower.is_match(&c.to_string()) {
450 symbol = self.pattern_map.get("VowelLower");
451 found = true;
452 }
453
454 if !found && self.pattern.regex_numeric.is_match(&c.to_string()) {
455 symbol = self.pattern_map.get("Numeric");
456 found = true;
457 }
458
459 if !found && self.pattern.regex_space.is_match(&c.to_string()) {
460 symbol = self.pattern_map.get("WhiteSpace");
461 found = true;
462 }
463
464 if !found && self.pattern.regex_punctuation.is_match(&c.to_string()) {
465 symbol = self.pattern_map.get("Punctuation");
466 found = true;
467 }
468
469 // if not matched, then use "Unknown" placeholder symbol
470 if !found {
471 symbol = self.pattern_map.get("Unknown");
472 }
473
474 *symbol.unwrap()
475 }
476}
477
478pub trait Engine {
479 fn analyze_entities(entities: Vec<String>) -> Vec<(String, Vec<Fact>)> {
480 let (tx, rx): (Sender<(String, Vec<Fact>)>, Receiver<(String, Vec<Fact>)>) =
481 mpsc::channel();
482 let mut children = Vec::new();
483
484 for entity in entities.clone() {
485 let thread_tx = tx.clone();
486 let child = thread::spawn(move || {
487 thread_tx
488 .send(PatternDefinition::new().analyze(&entity))
489 .unwrap();
490 debug!("PatternDefinition::analyze thread finished for {}", entity);
491 });
492
493 children.push(child);
494 }
495
496 let mut results = Vec::new();
497 for entity in entities {
498 results.push(match rx.recv() {
499 Ok(result) => result,
500 Err(_) => {
501 error!("Error: Could not analyze the entity: {}", entity);
502 panic!("Error: Could not analyze the data!")
503 }
504 });
505 }
506
507 for child in children {
508 child.join().expect("Error: Could not analyze the data!");
509 }
510
511 results
512 }
513
514 fn profile_entities(mut profile: Profile, entities: Vec<String>) -> Result<Profile, String> {
515 let results = Self::analyze_entities(entities);
516
517 for result in results {
518 match profile.apply_facts(result.0, result.1) {
519 Ok(_) => {}
520 Err(e) => {
521 return Err(format!(
522 "Error: Couldn't apply the Pattern and Facts to the Profile. Error Message: {}",
523 e.to_string()
524 ))
525 }
526 }
527 }
528
529 Ok(profile)
530 }
531
532 fn profile_entities_with_container(container: EngineContainer) -> Result<Profile, String> {
533 Self::profile_entities(container.profile, container.entities)
534 }
535}
536
/// Bundles the inputs of `Engine::profile_entities` into a single value, for use with
/// `Engine::profile_entities_with_container`.
pub struct EngineContainer {
    /// the profile that the analyzed patterns and Facts are applied to
    pub profile: Profile,
    /// the entities (sample values) to analyze
    pub entities: Vec<String>,
}
541
// Unit Tests
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal implementor so the default methods of `Engine` can be called.
    struct Xtest {}
    impl Engine for Xtest {}

    #[test]
    fn test_fact_new() {
        //fact created for the character 'r' in the string "word"
        let fact = Fact::new('r', 'c', 0, 0, 2);

        assert_eq!(fact.key, 'r');
        assert_eq!(fact.pattern_placeholder, 'c');
        assert_eq!(fact.starts_with, 0);
        assert_eq!(fact.ends_with, 0);
        assert_eq!(fact.index_offset, 2);
        assert_eq!(fact.prior_key, None);
        assert_eq!(fact.next_key, None);
    }

    #[test]
    fn test_fact_new_from_serialized() {
        let serialized = "{\"key\":\"r\",\"prior_key\":null,\"next_key\":null,\"pattern_placeholder\":\"c\",\"starts_with\":0,\"ends_with\":0,\"index_offset\":2}";
        let fact = Fact::from_serialized(serialized);

        assert_eq!(fact.key, 'r');
        assert_eq!(fact.pattern_placeholder, 'c');
        assert_eq!(fact.index_offset, 2);
    }

    #[test]
    fn test_fact_serialize() {
        //fact created for the character 'r' in the string "word"
        let mut fact = Fact::new('r', 'c', 0, 0, 2);
        let serialized = fact.serialize();

        assert_eq!(serialized,"{\"key\":\"r\",\"prior_key\":null,\"next_key\":null,\"pattern_placeholder\":\"c\",\"starts_with\":0,\"ends_with\":0,\"index_offset\":2}");
    }

    #[test]
    fn test_fact_set_next_key() {
        //fact created for the character 'r' in the string "word"
        let mut fact = Fact::new('r', 'c', 0, 0, 2);
        fact.set_next_key('d');

        assert_eq!(fact.next_key, Some('d'));
        assert_eq!(fact.prior_key, None);
    }

    #[test]
    fn test_fact_set_prior_key() {
        //fact created for the character 'r' in the string "word"
        let mut fact = Fact::new('r', 'c', 0, 0, 2);
        fact.set_prior_key('o');

        assert_eq!(fact.prior_key, Some('o'));
        assert_eq!(fact.next_key, None);
    }

    #[test]
    fn test_pattern_definition_new() {
        let pttrn_def = PatternDefinition::new();

        assert_eq!(pttrn_def.get("Unknown"), '@');
        assert_eq!(pttrn_def.get("VowelUpper"), 'V');
        assert_eq!(pttrn_def.get("Punctuation"), 'p');
    }

    #[test]
    fn test_pattern_definition_symbolize_char() {
        let pttrn_def = PatternDefinition::new();

        assert_eq!(pttrn_def.symbolize_char('A'), 'V');
        assert_eq!(pttrn_def.symbolize_char('b'), 'c');
        assert_eq!(pttrn_def.symbolize_char('7'), '#');
        assert_eq!(pttrn_def.symbolize_char(' '), 'S');
        assert_eq!(pttrn_def.symbolize_char('@'), '@');
    }

    #[test]
    fn test_pattern_definition_factualize() {
        let mut pttrn_def = PatternDefinition::new();
        let mut fact1 = pttrn_def.factualize("Word", 1);
        let mut fact2 = Fact::new('o', 'v', 0, 0, 1);
        fact2.set_prior_key('W');
        fact2.set_next_key('r');

        assert_eq!(fact1.serialize(), fact2.serialize());
    }

    #[test]
    fn test_pattern_definition_analyze() {
        let mut pttrn_def = PatternDefinition::new();
        let word = pttrn_def.analyze("HELlo0?^@");

        assert_eq!(word.0, "CVCcv#pp@");
        assert_eq!(word.1.len(), 9);

        // first and last chars carry the boundary flags and lack the outer neighbour
        assert_eq!(word.1[0].starts_with, 1);
        assert_eq!(word.1[0].prior_key, None);
        assert_eq!(word.1[0].next_key, Some('E'));
        assert_eq!(word.1[8].ends_with, 1);
        assert_eq!(word.1[8].next_key, None);
    }

    #[test]
    fn test_pattern_definition_analyze_multithread() {
        let words = vec![
            "word-one".to_string(),
            "word-two".to_string(),
            "word-three".to_string(),
            "word-four".to_string(),
            "word-five".to_string(),
        ];

        let results = Xtest::analyze_entities(words);

        assert_eq!(results.len(), 5);

        // Each pattern has one symbol per char; sort so the check does not depend on
        // the order in which the results are returned.
        let mut lengths: Vec<usize> = results.iter().map(|r| r.0.chars().count()).collect();
        lengths.sort();
        assert_eq!(lengths, vec![8, 8, 9, 9, 10]);
    }

    #[test]
    fn test_profile_entities() {
        let profile = Profile::new();
        let words = vec![
            "word-one".to_string(),
            "word-two".to_string(),
            "word-three".to_string(),
            "word-four".to_string(),
            "word-five".to_string(),
        ];
        let result = Xtest::profile_entities(profile, words);

        assert!(result.is_ok());
    }
}
652}