test_data_generation/lib.rs
1// Copyright 2018 David Sietz and [`test-data-generator` contributors](https://github.com/dsietz/test-data-generator/blob/master/CONTRIBUTORS.md).
2// Licensed under the MIT license
3// (see LICENSE or <https://opensource.org/licenses/Apache-2.0>)
4//
5//!
//! There are multiple ways to use the Test Data Generation library. It all depends on your intent.
7//!
8//! ### Profile
9//!
10//! The easiest way is to use a Profile. The `profile` module provides functionality to create a profile on a data sample (Strings).
11//! Once a profile has been made, data can be generated by calling the _pre_generate()_ and _generate()_ functions, in that order.
12//!
13//! ```
14//! extern crate test_data_generation;
15//!
16//! use test_data_generation::Profile;
17//!
18//! fn main() {
//!     // create a new (empty) profile
20//! let mut data_profile = Profile::new();
21//!
22//! // analyze the dataset
23//! data_profile.analyze("Smith, John");
24//! data_profile.analyze("Doe, John");
25//! data_profile.analyze("Dale, Danny");
26//! data_profile.analyze("Rickets, Ronney");
27//!
28//! // confirm 4 data samples were analyzed
29//! assert_eq!(data_profile.patterns.len(), 4);
30//!
31//! // prepare the generator
32//! data_profile.pre_generate();
33//!
34//! // generate some data
35//! println!("The generated name is {:?}", data_profile.generate());
36//! }
37//! ```
38//!
39//! You can also export (archive as JSON file) the profile for later use.
40//! This allows for the algorithm to be retrieved without having to store the actual data that was analyzed.
41//!
42//! ```
43//! extern crate test_data_generation;
44//!
45//! use test_data_generation::Profile;
46//!
47//! fn main() {
48//! //create a profile and analyze some data
49//! let mut old_profile = Profile::new();
50//! old_profile.analyze("Smith, John");
51//! old_profile.analyze("O'Brian, Henny");
52//! old_profile.analyze("Dale, Danny");
53//! old_profile.analyze("Rickets, Ronney");
54//!
55//! old_profile.pre_generate();
56//!
57//! //save the profile for later
58//! assert_eq!(old_profile.save("./tests/samples/sample-00-profile").unwrap(), true);
59//!
60//! // create a new profile from the archive json file
61//! let mut new_profile = Profile::from_file("./tests/samples/sample-00-profile");
62//!
63//! // generate some data. NOTE that the pre-generate() was already called prior to saving
64//! println!("The generated name is {:?}", new_profile.generate());
65//! }
66//! ```
67//!
68//! ### Data Sample Parser
69//!
70//! If you are using CSV files of data samples, then you may wish to use a Data Sample Parser.
71//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it, so that test data can be generated based on profiles.
72//!
73//! ```
74//! extern crate test_data_generation;
75//! use test_data_generation::data_sample_parser::DataSampleParser;
76//!
77//! fn main() {
78//! let mut dsp = DataSampleParser::new();
79//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
80//!
81//! println!("My new name is {} {}", dsp.generate_record()[0], dsp.generate_record()[1]);
82//! // My new name is Abbon Aady
83//! }
84//! ```
85//!
86//! You can also save the Data Sample Parser (the algorithm) as an archive file (json) ...
87//!
88//! ```
89//! extern crate test_data_generation;
90//! use test_data_generation::data_sample_parser::DataSampleParser;
91//!
92//! fn main() {
93//! let mut dsp = DataSampleParser::new();
94//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
95//!
96//! assert_eq!(dsp.save(&String::from("./tests/samples/sample-01-dsp")).unwrap(), true);
97//! }
98//! ```
99//!
100//! and use it at a later time.
101//!
102//! ```
103//! extern crate test_data_generation;
104//! use test_data_generation::data_sample_parser::DataSampleParser;
105//!
106//! fn main() {
107//! let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-01-dsp"));
108//!
109//! println!("Sample data is {:?}", dsp.generate_record()[0]);
110//! }
111//! ```
112//!
113//! You can also generate a new csv file based on the data sample provided.
114//!
115//! ```
116//! extern crate test_data_generation;
117//!
118//! use test_data_generation::data_sample_parser::DataSampleParser;
119//!
120//! fn main() {
121//! let mut dsp = DataSampleParser::new();
122//!
123//! // Use the default delimiter (comma)
124//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
125//! dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
126//! }
127//! ```
128#![crate_type = "lib"]
129#![crate_name = "test_data_generation"]
130
131#[macro_use]
132extern crate log;
133
134#[macro_use]
135extern crate serde_derive;
136extern crate crossbeam;
137extern crate csv;
138extern crate indexmap;
139extern crate levenshtein;
140extern crate rand;
141extern crate regex;
142extern crate serde;
143extern crate serde_json;
144extern crate serde_yaml;
145extern crate yaml_rust;
146
147use crate::engine::{Fact, PatternDefinition};
148use std::collections::BTreeMap;
149use std::fs::File;
150use std::io;
151use std::io::prelude::*;
152use std::io::Write;
153use std::ops::AddAssign;
154
/// Maps a symbolic pattern to the number of times it has been applied to a profile.
type PatternMap = BTreeMap<String, u32>;
/// Maps a pattern length to the number of times a pattern of that length has been applied.
type SizeMap = BTreeMap<u32, u32>;
/// Maps a pattern length to its percent chance of occurrence (intermediate result of `cum_sizemap`).
type SizeRankMap = BTreeMap<u32, f64>;
158
#[derive(Clone, Serialize, Deserialize, Debug)]
/// Represents a Profile for sample data that has been analyzed and can be used to generate realistic data.
///
/// The counters (`patterns`, `sizes`, the totals and the key/value snapshots) are maintained by
/// `apply_facts()`. The ranking vectors are only rebuilt by `cum_patternmap()` / `cum_sizemap()`
/// (both invoked by `pre_generate()`), so they may lag behind the counters until then.
pub struct Profile {
    /// An identifier (not necessarily unique) that is used to differentiate profiles from one another.
    /// `new()` and `new_with_processors()` leave it as `None`.
    pub id: Option<String>,
    /// A list of symbolic patterns with a distinct count of occurrences
    pub patterns: PatternMap,
    /// The sum of all the occurrence counts held in `patterns` (recalculated by `apply_facts()`)
    pub pattern_total: u32,
    /// A snapshot of the keys of `patterns`, refreshed by `apply_facts()`
    /// (used for temporary storage due to lifetime issues)
    pub pattern_keys: Vec<String>,
    /// A snapshot of the values (occurrence counts) of `patterns`, refreshed by `apply_facts()`
    /// (used for temporary storage due to lifetime issues)
    pub pattern_vals: Vec<u32>,
    /// A list of symbolic patterns with their percent chance of occurrence,
    /// sorted by `cum_patternmap()` from the most to the least likely pattern
    pub pattern_percentages: Vec<(String, f64)>,
    /// A list of symbolic patterns with a running total of percent chance of occurrence, in increasing order
    /// (same pattern order as `pattern_percentages`)
    pub pattern_ranks: Vec<(String, f64)>,
    /// A list of pattern lengths (`String::len()` of the pattern) with a distinct count of occurrence
    pub sizes: SizeMap,
    /// The sum of all the occurrence counts held in `sizes` (recalculated by `apply_facts()`)
    pub size_total: u32,
    /// A list of pattern sizes (lengths) with a running total of their percent chance of occurrence, in increasing order
    pub size_ranks: Vec<(u32, f64)>,
    /// The number of processors used to distribute the work load (multi-thread) while finding Facts to generate data
    pub processors: u8,
    /// A list of processors (which are lists of Facts) that store all the Facts in the profile.
    /// `apply_facts()` spreads incoming Facts across these lists in round-robin fashion.
    pub facts: Vec<Vec<Fact>>,
}
189
190impl Profile {
191 /// Constructs a new Profile
192 ///
    /// # Example
194 ///
195 /// ```rust
196 /// extern crate test_data_generation;
197 ///
198 /// use test_data_generation::Profile;
199 ///
200 /// fn main() {
201 /// let placeholder = Profile::new();
202 /// }
203 /// ```
204 pub fn new() -> Profile {
205 Profile {
206 id: None,
207 patterns: PatternMap::new(),
208 pattern_total: 0,
209 pattern_keys: Vec::new(),
210 pattern_vals: Vec::new(),
211 pattern_percentages: Vec::new(),
212 pattern_ranks: Vec::new(),
213 sizes: SizeMap::new(),
214 size_total: 0,
215 size_ranks: Vec::new(),
216 processors: 4,
217 facts: Profile::new_facts(4),
218 }
219 }
220
221 /// Constructs a new Profile using an identifier
222 ///
    /// # Example
224 ///
225 /// ```rust
226 /// extern crate test_data_generation;
227 ///
228 /// use test_data_generation::Profile;
229 ///
230 /// fn main() {
231 /// let placeholder = Profile::new_with_id("12345".to_string());
232 /// }
233 /// ```
234 pub fn new_with_id(id: String) -> Profile {
235 Profile {
236 id: Some(id),
237 patterns: PatternMap::new(),
238 pattern_total: 0,
239 pattern_keys: Vec::new(),
240 pattern_vals: Vec::new(),
241 pattern_percentages: Vec::new(),
242 pattern_ranks: Vec::new(),
243 sizes: SizeMap::new(),
244 size_total: 0,
245 size_ranks: Vec::new(),
246 processors: 4,
247 facts: Profile::new_facts(4),
248 }
249 }
250
251 /// Constructs a new Profile with a specified number of processors to analyze the data.
252 /// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
253 ///
254 /// # Arguments
255 ///
256 /// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
    /// Increasing the number of processors will speed up the generator by distributing the workload.
258 /// The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
259 /// NOTE: The default number of processors is 4.
260 ///
    /// # Example
262 ///
263 /// ```rust
264 /// extern crate test_data_generation;
265 ///
266 /// use test_data_generation::Profile;
267 ///
268 /// fn main() {
269 /// let processors: u8 = 10;
270 /// let placeholder = Profile::new_with_processors(processors);
271 /// }
272 /// ```
273 pub fn new_with_processors(p: u8) -> Profile {
274 Profile {
275 id: None,
276 patterns: PatternMap::new(),
277 pattern_total: 0,
278 pattern_keys: Vec::new(),
279 pattern_vals: Vec::new(),
280 pattern_percentages: Vec::new(),
281 pattern_ranks: Vec::new(),
282 sizes: SizeMap::new(),
283 size_total: 0,
284 size_ranks: Vec::new(),
285 processors: p,
286 facts: Profile::new_facts(p),
287 }
288 }
289
290 /// Constructs a new Profile from an exported JSON file. This is used when restoring from "archive"
291 ///
292 /// # Arguments
293 ///
294 /// * `path: &str` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
295 ///
    /// # Example
297 ///
298 /// ```rust
299 /// extern crate test_data_generation;
300 ///
301 /// use test_data_generation::Profile;
302 ///
303 /// fn main() {
304 /// let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
305 ///
306 /// profile.pre_generate();
307 ///
308 /// println!("The generated name is {:?}", profile.generate());
309 /// }
310 /// ```
311 pub fn from_file(path: &'static str) -> Profile {
312 // open the archive file
313 let mut file = match File::open(format!("{}.json", &path)) {
314 Err(_e) => {
315 error!("Could not open file {:?}", &path.to_string());
316 panic!("Could not open file {:?}", &path.to_string());
317 }
318 Ok(f) => {
319 info!("Successfully opened file {:?}", &path.to_string());
320 f
321 }
322 };
323
324 //read the archive file
325 let mut serialized = String::new();
326 match file.read_to_string(&mut serialized) {
327 Err(e) => {
328 error!(
329 "Could not read file {:?} because of {:?}",
330 &path.to_string(),
331 e.to_string()
332 );
333 panic!(
334 "Could not read file {:?} because of {:?}",
335 &path.to_string(),
336 e.to_string()
337 );
338 }
339 Ok(s) => {
340 info!("Successfully read file {:?}", &path.to_string());
341 s
342 }
343 };
344
345 //serde_json::from_str(&serialized).unwrap()
346 Self::from_serialized(&serialized)
347 }
348
349 /// Constructs a new Profile from a serialized (JSON) string of the Profile object. This is used when restoring from "archive"
350 ///
    /// # Example
352 ///
353 /// ```rust
354 /// extern crate test_data_generation;
355 ///
356 /// use test_data_generation::Profile;
357 ///
358 /// fn main() {
359 /// let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
360 /// let mut profile = Profile::from_serialized(&serialized);
361 ///
362 /// profile.pre_generate();
363 ///
364 /// println!("The generated name is {:?}", profile.generate());
365 /// }
366 /// ```
367 pub fn from_serialized(serialized: &str) -> Profile {
368 serde_json::from_str(&serialized).unwrap()
369 }
370
    /// This function converts a data point (&str) to a pattern and adds it to the profile
372 ///
373 /// # Arguments
374 ///
    /// * `entity: &str` - The textual str of the value to analyze.</br>
376 ///
377 /// # Example
378 ///
379 /// ```rust
380 /// extern crate test_data_generation;
381 ///
382 /// use test_data_generation::Profile;
383 ///
384 /// fn main() {
385 /// let mut profile = Profile::new();
386 /// profile.analyze("One");
387 /// profile.analyze("Two");
388 /// profile.analyze("Three");
389 /// profile.analyze("Four");
390 ///
391 /// assert_eq!(profile.patterns.len(), 4);
392 /// }
393 /// ```
394 pub fn analyze(&mut self, entity: &str) {
395 let rslt = PatternDefinition::new().analyze(entity);
396 let _t = self.apply_facts(rslt.0, rslt.1).map_err(|e| {
397 error!(
398 "Warning: Couldn't apply the pattern and facts for the entity {}!",
399 entity
400 );
401 e.to_string()
402 });
403 }
404
405 /// This function applies the pattern and list of Facts to the profile
406 ///
407 /// # Arguments
408 ///
    /// * `pattern: String` - The string that represents the pattern of the entity that was analyzed.</br>
410 /// * `facts: Vec<Fact>` - A Vector containing the Facts based on the analysis (one for each char in the entity).</br>
411 ///
412 /// # Example
413 ///
414 /// ```rust
415 /// extern crate test_data_generation;
416 ///
417 /// use test_data_generation::engine::{Fact, PatternDefinition};
418 /// use test_data_generation::Profile;
419 ///
420 /// fn main() {
421 /// let mut profile = Profile::new();
422 /// let results = PatternDefinition::new().analyze("Word");
423 ///
424 /// assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
425 /// }
426 /// ```
427 #[inline]
428 pub fn apply_facts(&mut self, pattern: String, facts: Vec<Fact>) -> Result<i32, String> {
429 // balance the storing of facts across all the vectors that can be processed in parallel
430 let mut i = 0;
431 for f in facts.into_iter() {
432 if i == self.processors {
433 i = 0;
434 }
435
436 self.facts[i as usize].push(f);
437 i += 1;
438 }
439
440 // store the pattern
441 AddAssign::add_assign(self.patterns.entry(pattern.to_string()).or_insert(0), 1);
442
443 // store the total number of patterns generated so far
444 self.pattern_total = self.patterns.values().sum::<u32>();
445
446 // analyze sizes
447 AddAssign::add_assign(self.sizes.entry(pattern.len() as u32).or_insert(0), 1);
448 self.size_total = self.sizes.values().sum::<u32>();
449
450 self.pattern_keys = self.patterns.keys().cloned().collect();
451 self.pattern_vals = self.patterns.values().cloned().collect();
452
453 Ok(1)
454 }
455
456 /// This function calculates the patterns to use by the chance they will occur (as cumulative percentage) in decreasing order
457 ///
458 /// # Example
459 ///
460 /// ```rust
461 /// extern crate test_data_generation;
462 ///
463 /// use test_data_generation::Profile;
464 ///
465 /// fn main() {
466 /// let mut profile = Profile::new();
467 ///
468 /// profile.analyze("Smith, John");
469 /// profile.analyze("O'Brian, Henny");
470 /// profile.analyze("Dale, Danny");
471 /// profile.analyze("Rickets, Ronnae");
472 /// profile.analyze("Richard, Richie");
473 /// profile.analyze("Roberts, Blake");
474 /// profile.analyze("Conways, Sephen");
475 ///
476 /// profile.pre_generate();
477 /// let test = [("CvccvccpSCvccvv".to_string(), 28.57142857142857 as f64), ("CcvccpSCvcc".to_string(), 42.857142857142854 as f64), ("CvccvccpSCvccvc".to_string(), 57.14285714285714 as f64), ("CvcvcccpSCcvcv".to_string(), 71.42857142857142 as f64), ("CvcvpSCvccc".to_string(), 85.7142857142857 as f64), ("V@CcvvcpSCvccc".to_string(), 99.99999999999997 as f64)];
478 ///
479 /// assert_eq!(profile.pattern_ranks, test);
480 /// }
481 /// ```
482 #[inline]
483 pub fn cum_patternmap(&mut self) {
484 // Reference: https://users.rust-lang.org/t/cannot-infer-an-appropriate-lifetime-for-autoref/13360/3
485
486 debug!("calculating the cumulative percentage of occurences for data point patterns...");
487
488 // calculate the percentage by patterns
489 // -> {"CcvccpSCvcc": 14.285714285714285, "CvccvccpSCvccvc": 14.285714285714285, "CvccvccpSCvccvv": 28.57142857142857, "CvcvcccpSCcvcv": 14.285714285714285, "CvcvpSCvccc": 14.285714285714285, "V~CcvvcpSCvccc": 14.285714285714285}
490 let n = self.patterns.len();
491
492 // see issue: https://github.com/dsietz/test-data-generation/issues/88
493 self.pattern_percentages.clear();
494
495 for m in 0..n {
496 self.pattern_percentages.push((
497 self.pattern_keys[m].clone(),
498 (self.pattern_vals[m] as f64 / self.pattern_total as f64) * 100.0,
499 ));
500 }
501
502 // sort the ranks by percentages in decreasing order
503 // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 14.285714285714285), ("CvccvccpSCvccvc", 14.285714285714285), ("CvcvcccpSCcvcv", 14.285714285714285), ("CvcvpSCvccc", 14.285714285714285), ("V~CcvvcpSCvccc", 14.285714285714285)]
504 self.pattern_percentages
505 .sort_by(|&(_, a), &(_, b)| b.partial_cmp(&a).unwrap());
506
507 // calculate the cumulative sum of the pattern rankings
508 // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 42.857142857142854), ("CvccvccpSCvccvc", 57.14285714285714), ("CvcvcccpSCcvcv", 71.42857142857142), ("CvcvpSCvccc", 85.7142857142857), ("V~CcvvcpSCvccc", 99.99999999999997)]
509 let mut rank: f64 = 0.00;
510
511 // see issue: https://github.com/dsietz/test-data-generation/issues/88
512 self.pattern_ranks.clear();
513
514 for pttrn in self.pattern_percentages.iter() {
515 let tmp = pttrn.1 + rank;
516 self.pattern_ranks.push((pttrn.0.clone(), tmp));
517 rank = tmp;
518 }
519 }
520
521 /// This function calculates the sizes to use by the chance they will occur (as cumulative percentage) in decreasing order
522 ///
523 /// # Example
524 ///
525 /// ```rust
526 /// extern crate test_data_generation;
527 ///
528 /// use test_data_generation::Profile;
529 ///
530 /// fn main() {
531 /// let mut profile = Profile::new();
532 /// profile.analyze("One");
533 /// profile.analyze("Two");
534 /// profile.analyze("Three");
535 /// profile.analyze("Four");
536 /// profile.analyze("Five");
537 /// profile.analyze("Six");
538 ///
539 /// profile.cum_sizemap();
540 ///
541 /// print!("The size ranks are {:?}", profile.size_ranks);
542 /// // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
543 /// }
544 /// ```
545 #[inline]
546 pub fn cum_sizemap(&mut self) {
547 debug!("calculating the cumulative percentage of occurences for data point sizes...");
548 // calculate the percentage by sizes
549 // -> {11: 28.57142857142857, 14: 14.285714285714285, 15: 57.14285714285714}
550 let mut size_ranks = SizeRankMap::new();
551
552 for key in self.sizes.keys() {
553 size_ranks.insert(
554 *key,
555 (*self.sizes.get(key).unwrap() as f64 / self.size_total as f64) * 100.0,
556 );
557 }
558
559 // sort the ranks by percentages in decreasing order
560 // -> [(15, 57.14285714285714), (11, 28.57142857142857), (14, 14.285714285714285)]
561 let mut sizes = size_ranks.iter().collect::<Vec<_>>();
562 sizes.sort_by(|&(_, a), &(_, b)| b.partial_cmp(a).unwrap());
563
564 // calculate the cumulative sum of the size rankings
565 // -> [(15, 57.14285714285714), (11, 85.71428571428571), (14, 100)]
566 self.size_ranks = sizes
567 .iter()
568 .scan((0_u32, 0.00_f64), |state, &(&k, &v)| {
569 *state = (k, state.1 + &v);
570 Some(*state)
571 })
572 .collect::<Vec<(_, _)>>();
573 }
574
    /// This function generates realistic test data based on the sample data that was analyzed.
576 ///
577 /// # Example
578 ///
579 /// ```rust
580 /// extern crate test_data_generation;
581 ///
582 /// use test_data_generation::Profile;
583 ///
584 /// fn main() {
585 /// let mut profile = Profile::new();
586 ///
587 /// profile.analyze("One");
588 /// profile.analyze("Two");
589 /// profile.analyze("Three");
590 /// profile.analyze("Four");
591 /// profile.analyze("Five");
592 ///
593 /// profile.pre_generate();
594 ///
595 /// print!("The test data {:?} was generated.", profile.generate());
596 /// }
597 /// ```
598 #[inline]
599 pub fn generate(&mut self) -> String {
600 // 1. get a random number
601 let s: f64 = random_percentage!();
602
603 // 2. find the first pattern that falls within the percentage chance of occurring
604 // NOTE: The following 2 lines has been commented out because this doesn't need to
605 // happen since the patterns are already ranks by percent chance of occurring
606 // and therefore sizes (lengths) as well since the patterns include the full
607 // length of the entitiy analyzed.
608 //let size = self.size_ranks.iter().find(|&&x|&x.1 >= &s).unwrap().0;
609 //let pattern = self.pattern_ranks.iter().find(|x|&x.1 >= &s && x.0.len() == size as usize).unwrap().clone();
610 let pattern = self
611 .pattern_ranks
612 .iter()
613 .find(|x| &x.1 >= &s)
614 .unwrap()
615 .clone();
616
617 // lastly, generate the test data using facts that adhere to the pattern
618 self.generate_from_pattern(pattern.0)
619 }
620
621 /// This function generates realistic test data based on the sample data that was analyzed.
622 ///
623 /// # Arguments
624 ///
625 /// * `pattern: String` - The pattern to reference when generating the test data.</br>
626 ///
627 /// # Example
628 ///
629 /// ```rust
630 /// extern crate test_data_generation;
631 ///
632 /// use test_data_generation::Profile;
633 ///
634 /// fn main() {
635 /// let mut profile = Profile::new();
636 ///
637 /// profile.analyze("01/13/2017");
638 /// profile.analyze("11/24/2017");
639 /// profile.analyze("08/05/2017");
640 ///
641 /// profile.pre_generate();
642 ///
643 /// let generated = profile.generate_from_pattern("##p##p####".to_string());
644 ///
645 /// assert_eq!(generated.len(), 10);
646 /// }
647 /// ```
648 #[inline]
649 pub fn generate_from_pattern(&self, pattern: String) -> String {
650 let pattern_chars = pattern.chars().collect::<Vec<char>>();
651 let mut generated = String::new();
652 let prev_char = ' ';
653
654 // iterate through the chars in the pattern string
655 for (idx, ch) in pattern_chars.iter().enumerate() {
656 match crossbeam::scope(|scope| {
657 let c = ch;
658 let starts = if idx == 0 { 1 } else { 0 };
659 let ends = if idx == pattern_chars.len() - 1 { 1 } else { 0 };
660 let mut fact_options = vec![];
661 let prior_char = prev_char;
662
663 // iterate through the processors (vec) that hold the lists (vec) of facts
664 for v in &self.facts {
665 let selected_facts = scope.spawn(move |_| {
666 let mut facts = vec![];
667
668 // iterate through the list of facts
669 for value in v {
670 if value.starts_with == starts
671 && value.ends_with == ends
672 && value.pattern_placeholder == *c
673 && value.index_offset == idx as u32
674 {
675 facts.push(value.key);
676
677 // if the value.key's prior char matches the prior generated char, then weight the value.key
678 // to increase the chance of it being used when generated
679 if value.prior_key.unwrap_or(' ') == prior_char {
680 facts.push(value.key);
681 facts.push(value.key);
682 }
683
684 // if the value.key's index_offset matches the current index, then weight the value.key
685 // to increase the chance of it being used when generated
686 if value.index_offset == idx as u32 {
687 facts.push(value.key);
688 facts.push(value.key);
689 }
690 }
691 }
692
693 facts
694 });
695
696 //append the selected_facts to the fact_options
697 //fact_options.extend_from_slice(&selected_facts.join());
698 match selected_facts.join() {
699 Ok(sf) => fact_options.extend_from_slice(&sf),
700 Err(err) => {
701 error!("{:?}", err);
702 panic!("{:?}", err);
703 }
704 }
705 }
706
707 //select a fact to use as the generated char
708 let rnd_start = 0;
709 let rnd_end = fact_options.len() - 1;
710
711 if rnd_start >= rnd_end {
712 //generated.push(fact_options[0 as usize]);
713 fact_options[0_usize]
714 } else {
715 let x: u32 = random_between!(rnd_start, rnd_end);
716 //prev_char = fact_options[x as usize];
717 //generated.push(prev_char);
718 fact_options[x as usize]
719 }
720 }) {
721 Ok(c) => generated.push(c),
722 Err(err) => {
723 error!("{:?}", err);
724 panic!("{:?}", err);
725 }
726 }
727 }
728
729 generated
730 }
731
    /// This function learns by measuring how realistic the test data it generates is, compared to the sample data that was provided.
733 ///
734 /// # Arguments
735 ///
736 /// * `control_list: Vec<String>` - The list of strings to compare against. This would be the real data from the data sample.</br>
737 ///
738 /// # Example
739 ///
740 /// ```rust
741 /// extern crate test_data_generation;
742 ///
743 /// use test_data_generation::Profile;
744 ///
745 /// fn main() {
746 /// let mut profil = Profile::new();
747 /// let sample_data = vec!("Smith, John".to_string(),"Doe, John".to_string(),"Dale, Danny".to_string(),"Rickets, Ronney".to_string());
748 ///
749 /// for sample in sample_data.iter().clone() {
750 /// profil.analyze(&sample);
751 /// }
752 ///
    ///     // in order to learn, the profile must be prepared with pre_generate()
754 /// // so it can generate data to learn from
755 /// profil.pre_generate();
756 ///
757 /// let learning = profil.learn_from_entity(sample_data).unwrap();
758 ///
759 /// assert_eq!(learning, true);
760 /// }
761 /// ```
762 pub fn learn_from_entity(&mut self, control_list: Vec<String>) -> Result<bool, String> {
763 for _n in 0..10 {
764 let experiment = self.generate();
765 let mut percent_similarity: Vec<f64> = Vec::new();
766
767 for control in control_list.iter().clone() {
768 debug!("Comparing {} with {} ...", &control, &experiment);
769 percent_similarity.push(self.realistic_test(&control, &experiment));
770 }
771
772 let percent =
773 percent_similarity.iter().sum::<f64>() as f64 / percent_similarity.len() as f64;
774 debug!("Percent similarity is {} ...", &percent);
775
776 if percent >= 80_f64 {
777 self.analyze(&experiment);
778 }
779 }
780
781 Ok(true)
782 }
783
784 /// This function calculates the levenshtein distance between 2 strings.
785 /// See: https://crates.io/crates/levenshtein
786 ///
787 /// # Arguments
788 ///
789 /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
790 /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
791 ///
    /// # Example
793 ///
794 /// ```rust
795 /// extern crate test_data_generation;
796 ///
797 /// use test_data_generation::Profile;
798 ///
799 /// fn main() {
800 /// let mut profile = Profile::new();
801 ///
802 /// assert_eq!(profile.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
803 /// }
    /// ```
    pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
        // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
        // Thin wrapper: only the two strings are handed to the crate's `levenshtein_distance!` macro.
        levenshtein_distance!(control, experiment)
    }
809
810 /// This function calculates the percent difference between 2 strings.
811 ///
812 /// # Arguments
813 ///
814 /// * `control: &str` - The string to compare against. This would be the real data from the data sample.</br>
815 /// * `experiment: &str` - The string to compare. This would be the generated data for which you want to find the percent difference.</br>
816 ///
    /// # Example
818 ///
819 /// ```rust
820 /// extern crate test_data_generation;
821 ///
822 /// use test_data_generation::Profile;
823 ///
824 /// fn main() {
825 /// let mut profile = Profile::new();
826 ///
827 /// assert_eq!(profile.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
828 /// }
    /// ```
    #[inline]
    pub fn realistic_test(&mut self, control: &str, experiment: &str) -> f64 {
        // Thin wrapper: only the two strings are handed to the crate's `realistic_test!` macro.
        realistic_test!(control, experiment)
    }
834
    /// This function is called from within the implemented structure and returns a list of processors (Vec) with empty lists (Vec) for their Facts.
836 /// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
837 ///
838 /// # Arguments
839 ///
840 /// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
    /// Increasing the number of processors will speed up the generator by distributing the workload.
842 /// The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
843 /// NOTE: The default number of processors is 4.
844 ///
845 #[inline]
846 fn new_facts(p: u8) -> Vec<Vec<Fact>> {
847 let mut vec_main = Vec::new();
848
849 for _ in 0..p {
850 vec_main.push(Vec::new());
851 }
852
853 vec_main
854 }
855
    /// This function prepares the size and pattern rankings (accumulated percentages) that are needed to generate data
857 ///
858 /// # Example
859 ///
860 /// ```rust
861 /// extern crate test_data_generation;
862 ///
863 /// use test_data_generation::Profile;
864 ///
865 /// fn main() {
866 /// let mut profile = Profile::new();
867 /// profile.analyze("One");
868 /// profile.analyze("Two");
869 /// profile.analyze("Three");
870 /// profile.analyze("Four");
871 /// profile.analyze("Five");
872 /// profile.analyze("Six");
873 ///
874 /// profile.pre_generate();
875 ///
876 /// print!("The size ranks are {:?}", profile.size_ranks);
877 /// // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
878 /// }
879 /// ```
    pub fn pre_generate(&mut self) {
        info!("Preparing the profile for data generation...");
        // rebuild `size_ranks` from the size counters
        self.cum_sizemap();
        // rebuild `pattern_percentages` and `pattern_ranks` from the pattern counters
        self.cum_patternmap();
        info!("Profile: preparing generator...");
    }
886
887 /// This function resets the patterns that the Profile has analyzed.
888 /// Call this method whenever you wish to "clear" the Profile
889 ///
890 /// # Example
891 ///
892 /// ```rust
893 /// extern crate test_data_generation;
894 ///
895 /// use test_data_generation::Profile;
896 ///
897 /// fn main() {
898 /// let mut profile = Profile::new();
899 ///
900 /// profile.analyze("One");
901 /// profile.analyze("Two");
902 /// profile.analyze("Three");
903 ///
904 /// let x = profile.patterns.len();
905 ///
906 /// profile.reset_analyze();
907 ///
908 /// profile.analyze("Four");
909 /// profile.analyze("Five");
910 /// profile.analyze("Six");
911 /// profile.analyze("Seven");
912 /// profile.analyze("Eight");
913 /// profile.analyze("Nine");
914 /// profile.analyze("Ten");
915 ///
916 /// let y = profile.patterns.len();
917 ///
918 /// assert_eq!(x, 3);
919 /// assert_eq!(y, 5);
920 /// }
921 /// ```
922 pub fn reset_analyze(&mut self) {
923 info!("Resetting the profile ...");
924 self.patterns = PatternMap::new();
925 info!("Profile: patterns have been reset ...");
926 }
927
928 /// This function saves (exports) the Profile to a JSON file.
929 /// This is useful when you wish to reuse the algorithm to generate more test data later.
930 ///
931 /// # Arguments
932 ///
    /// * `path: &str` - The full path of the export file, excluding the file extension, (e.g.: "./test/data/custom-names").</br>
934 ///
    /// # Errors
936 /// If this function encounters any form of I/O or other error, an error variant will be returned.
937 /// Otherwise, the function returns Ok(true).</br>
938 ///
    /// # Example
940 ///
941 /// ```rust
942 /// extern crate test_data_generation;
943 ///
944 /// use test_data_generation::Profile;
945 ///
946 /// fn main() {
947 /// // analyze the dataset
948 /// let mut profile = Profile::new();
949 /// profile.analyze("Smith, John");
950 /// profile.analyze("O'Brian, Henny");
951 /// profile.analyze("Dale, Danny");
952 /// profile.analyze("Rickets, Ronney");
953 ///
954 /// profile.pre_generate();
955 ///
956 /// assert_eq!(profile.save("./tests/samples/sample-00-profile").unwrap(), true);
957 /// }
958 ///
959 pub fn save(&mut self, path: &'static str) -> Result<bool, io::Error> {
960 let dsp_json = serde_json::to_string(&self).unwrap();
961
962 // Create the archive file
963 let mut file = match File::create(format!("{}.json", &path)) {
964 Err(e) => {
965 error!("Could not create file {:?}", &path.to_string());
966 return Err(e);
967 }
968 Ok(f) => {
969 info!("Successfully exported to {:?}", &path.to_string());
970 f
971 }
972 };
973
974 // Write the json string to file, returns io::Result<()>
975 match file.write_all(dsp_json.as_bytes()) {
976 Err(e) => {
977 error!("Could not write to file {}", &path.to_string());
978 return Err(e);
979 }
980 Ok(_) => {
981 info!("Successfully exported to {}", &path.to_string());
982 }
983 };
984
985 Ok(true)
986 }
987
988 /// This function converts the Profile to a serialize JSON string.
989 ///
990 /// #Example
991 ///
992 /// ```rust
993 /// extern crate test_data_generation;
994 ///
995 /// use test_data_generation::Profile;
996 ///
997 /// fn main() {
998 /// // analyze the dataset
999 /// let mut data_profile = Profile::new();
1000 ///
1001 /// // analyze the dataset
1002 /// data_profile.analyze("OK");
1003 ///
1004 /// println!("{}", data_profile.serialize());
1005 /// // {"patterns":{"VC":1},"pattern_total":1,"pattern_keys":["VC"],"pattern_vals":[1],"pattern_percentages":[],"pattern_ranks":[],"sizes":{"2":1},"size_total":1,"size_ranks":[],"processors":4,"facts":[[{"key":"O","prior_key":null,"next_key":"K","pattern_placeholder":"V","starts_with":1,"ends_with":0,"index_offset":0}],[{"key":"K","prior_key":"O","next_key":null,"pattern_placeholder":"C","starts_with":0,"ends_with":1,"index_offset":1}],[],[]]}
1006 /// }
1007 ///
1008 pub fn serialize(&mut self) -> String {
1009 serde_json::to_string(&self).unwrap()
1010 }
1011}
1012
1013#[macro_use]
1014pub mod macros;
1015pub mod configs;
1016pub mod data_sample_parser;
1017pub mod engine;
1018pub mod shared;
1019
1020// Unit Tests
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a new Profile and lets it analyze every sample, in the given order.
    fn profile_from_samples(samples: &[&str]) -> Profile {
        let mut profile = Profile::new();

        for &sample in samples {
            profile.analyze(sample);
        }

        profile
    }

    #[test]
    fn apply_facts() {
        let mut profile = Profile::new();
        let analysis = PatternDefinition::new().analyze("Word");

        let applied = profile.apply_facts(analysis.0, analysis.1).unwrap();

        assert_eq!(applied, 1);
    }

    #[test]
    fn levenshtein_test() {
        let mut profil = Profile::new();
        let (left, right) = (String::from("kitten"), String::from("sitting"));

        assert_eq!(profil.levenshtein_distance(&left, &right), 3_usize);
    }

    #[test]
    fn realistic_data_test() {
        let mut profil = Profile::new();
        let (left, right) = (String::from("kitten"), String::from("sitting"));

        assert_eq!(profil.realistic_test(&left, &right), 76.92307692307692_f64);
    }

    #[test]
    fn learn_from_entity() {
        let mut profil = Profile::new();
        let sample_data: Vec<String> = [
            "Smith, John",
            "Doe, John",
            "Dale, Danny",
            "Rickets, Ronney",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();

        for sample in &sample_data {
            profil.analyze(sample);
        }

        profil.pre_generate();

        let learning = profil.learn_from_entity(sample_data).unwrap();

        assert_eq!(learning, true);
    }

    #[test]
    fn logging_test() {
        // passes as long as resetting a brand-new Profile does not panic
        Profile::new().reset_analyze();
    }

    #[test]
    fn new_profile_with_id() {
        let mut profile = Profile::new_with_id(String::from("12345"));
        profile.pre_generate();

        let id = profile.id.unwrap();

        assert_eq!(id, String::from("12345"));
    }

    #[test]
    fn new_profile_from_file() {
        let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
        profile.pre_generate();

        let generated = profile.generate();

        assert!(!generated.is_empty());
    }

    #[test]
    #[should_panic]
    fn new_profile_from_file_bad_data() {
        let mut profile = Profile::from_file("./tests/samples/not-readable");
        profile.pre_generate();

        let generated = profile.generate();

        assert!(!generated.is_empty());
    }

    #[test]
    #[should_panic(expected = "Could not open file \"./tests/samples/bad-path\"")]
    fn new_profile_from_file_bad_path() {
        let mut profile = Profile::from_file("./tests/samples/bad-path");
        profile.pre_generate();

        let generated = profile.generate();

        assert!(!generated.is_empty());
    }

    #[test]
    fn new_profile_from_serialized() {
        let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
        let mut profile = Profile::from_serialized(serialized);
        profile.pre_generate();

        assert_eq!(profile.generate(), "OK");
    }

    #[test]
    fn new_profile_new_with() {
        let profile = Profile::new_with_processors(10);

        assert_eq!(profile.processors, 10);
    }

    // ensure Profile is analyzing all the sample data points
    #[test]
    fn profile_analyze() {
        let profil = profile_from_samples(&[
            "Smith, John",
            "O'Brian, Henny",
            "Dale, Danny",
            "Rickets, Ronney",
        ]);

        assert_eq!(profil.patterns.len(), 4);
    }

    // ensure Profile is able to find the facts that relate to a pattern
    // NOTE: Dates need work! e.g.: 00/15/0027
    #[test]
    fn profile_generate_from_pattern_date() {
        let mut profil = profile_from_samples(&["01/13/2017", "11/24/2017", "08/05/2017"]);
        profil.pre_generate();

        let generated = profil.generate_from_pattern(String::from("##p##p####"));

        assert_eq!(10, generated.len());
    }

    // ensure Profile is able to find the facts that relate to a pattern
    #[test]
    fn profile_generate_from_pattern_string() {
        let mut profil = profile_from_samples(&["First", "Next", "Last"]);
        profil.pre_generate();

        let generated = profil.generate_from_pattern(String::from("Cvcc"));

        assert_eq!(4, generated.len());
    }

    // ensure Profile is generating correct test data
    #[test]
    fn profile_generate() {
        let mut profil = profile_from_samples(&[
            "Smith, John",
            "O'Brian, Henny",
            "Dale, Danny",
            "Rickets, Ronnae",
            "Richard, Richie",
            "Roberts, Blake",
            "Conways, Sephen",
        ]);
        profil.pre_generate();

        let generated = profil.generate();

        assert!(generated.len() > 10);
    }

    // issue #31
    // ensure Profile doesn't generate a name with a backslash preceding an apostrophe
    #[test]
    fn profile_generate_with_apostrophe() {
        let mut profil = profile_from_samples(&["O'Brien"]);
        profil.pre_generate();

        assert_eq!(profil.generate(), "O'Brien");
    }

    // ensure Profile is providing the correct pattern ranks after analyzing the sample data
    #[test]
    fn profile_pregenerate_patterns() {
        let mut profil = profile_from_samples(&[
            "Smith, John",
            "O'Brian, Henny",
            "Dale, Danny",
            "Rickets, Ronnae",
            "Richard, Richie",
            "Roberts, Blake",
            "Conways, Sephen",
        ]);
        profil.pre_generate();

        let expected = [
            (String::from("CvccvccpSCvccvv"), 28.57142857142857_f64),
            (String::from("CcvccpSCvcc"), 42.857142857142854_f64),
            (String::from("CvccvccpSCvccvc"), 57.14285714285714_f64),
            (String::from("CvcvcccpSCcvcv"), 71.42857142857142_f64),
            (String::from("CvcvpSCvccc"), 85.7142857142857_f64),
            (String::from("V@CcvvcpSCvccc"), 99.99999999999997_f64),
        ];

        assert_eq!(profil.pattern_ranks, expected);
    }

    // ensure Profile is providing the correct size ranks after analyzing the sample data
    #[test]
    fn profile_pregenerate_sizes() {
        let mut profil = profile_from_samples(&[
            "Smith, Johny", // 12
            "O'Brian, Hen", // 12
            "Dale, Danny",  // 11
            "O'Henry, Al",  // 11
            "Rickets, Ro",  // 11
            "Mr. Wilbers",  // 11
            "Po, Al",       // 6
        ]);
        profil.pre_generate();

        let expected = [
            (11, 57.14285714285714),
            (12, 85.71428571428571),
            (6, 100.0_f64),
        ];

        assert_eq!(profil.size_ranks, expected);
    }

    #[test]
    fn save_profile() {
        let mut profile = profile_from_samples(&[
            "Smith, John",
            "O'Brian, Henny",
            "Dale, Danny",
            "Rickets, Ronney",
        ]);
        profile.pre_generate();

        let saved = profile.save("./tests/samples/sample-00-profile").unwrap();

        assert!(saved);
    }

    // ensure a Profile can be exported (to be archived) as JSON
    #[test]
    fn serialize() {
        // analyze the dataset
        let mut profil = profile_from_samples(&["OK"]);

        let serialized = profil.serialize();

        assert_eq!(serialized, "{\"id\":null,\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}");
    }
}