test_data_generation/
lib.rs

1// Copyright 2018 David Sietz and [`test-data-generator` contributors](https://github.com/dsietz/test-data-generator/blob/master/CONTRIBUTORS.md).
2// Licensed under the MIT license
3// (see LICENSE or <https://opensource.org/licenses/Apache-2.0>)
4//
5//!
//! There are multiple ways to use the Test Data Generation library. It all depends on your intent.
7//!
8//! ### Profile
9//!
10//! The easiest way is to use a Profile. The `profile` module provides functionality to create a profile on a data sample (Strings).
11//! Once a profile has been made, data can be generated by calling the _pre_generate()_ and _generate()_ functions, in that order.
12//!
13//! ```
14//! extern crate test_data_generation;
15//!
16//! use test_data_generation::Profile;
17//!
18//! fn main() {
//!     // create a new profile
20//! 	let mut data_profile =  Profile::new();
21//!
22//!     // analyze the dataset
23//! 	data_profile.analyze("Smith, John");
24//! 	data_profile.analyze("Doe, John");
25//! 	data_profile.analyze("Dale, Danny");
26//! 	data_profile.analyze("Rickets, Ronney");
27//!
28//!     // confirm 4 data samples were analyzed
29//!    	assert_eq!(data_profile.patterns.len(), 4);
30//!
31//!     // prepare the generator
32//!     data_profile.pre_generate();
33//!
34//!     // generate some data
35//!    	println!("The generated name is {:?}", data_profile.generate());
36//! }
37//! ```
38//!
39//! You can also export (archive as JSON file) the profile for later use.
40//! This allows for the algorithm to be retrieved without having to store the actual data that was analyzed.
41//!
//! ```
43//! extern crate test_data_generation;
44//!
45//! use test_data_generation::Profile;
46//!
47//! fn main() {
48//!		//create a profile and analyze some data
49//!		let mut old_profile =  Profile::new();
50//!		old_profile.analyze("Smith, John");
51//!		old_profile.analyze("O'Brian, Henny");
52//!		old_profile.analyze("Dale, Danny");
53//!		old_profile.analyze("Rickets, Ronney");
54//!
55//!		old_profile.pre_generate();
56//!
57//!		//save the profile for later
58//!		assert_eq!(old_profile.save("./tests/samples/sample-00-profile").unwrap(), true);
59//!
60//!		// create a new profile from the archive json file
61//!		let mut new_profile = Profile::from_file("./tests/samples/sample-00-profile");
62//!
63//!		// generate some data. NOTE that the pre-generate() was already called prior to saving
64//!     println!("The generated name is {:?}", new_profile.generate());
65//! }
66//! ```
67//!
68//! ### Data Sample Parser
69//!
70//! If you are using CSV files of data samples, then you may wish to use a Data Sample Parser.
71//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it, so that test data can be generated based on profiles.
72//!
73//! ```
74//! extern crate test_data_generation;
75//! use test_data_generation::data_sample_parser::DataSampleParser;
76//!
77//! fn main() {
78//!     let mut dsp = DataSampleParser::new();
79//!     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
80//!
81//!     println!("My new name is {} {}", dsp.generate_record()[0], dsp.generate_record()[1]);
82//!     // My new name is Abbon Aady
83//! }
84//! ```
85//!
86//! You can also save the Data Sample Parser (the algorithm) as an archive file (json) ...
87//!
88//! ```
89//! extern crate test_data_generation;
90//! use test_data_generation::data_sample_parser::DataSampleParser;
91//!
92//! fn main() {
93//!     let mut dsp =  DataSampleParser::new();
94//!     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
95//!
96//!     assert_eq!(dsp.save(&String::from("./tests/samples/sample-01-dsp")).unwrap(), true);
97//! }
98//! ```
99//!
100//! and use it at a later time.
101//!
102//! ```
103//! extern crate test_data_generation;
104//! use test_data_generation::data_sample_parser::DataSampleParser;
105//!
106//! fn main() {
107//!     let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-01-dsp"));
108//!
109//! 	println!("Sample data is {:?}", dsp.generate_record()[0]);
110//! }
111//! ```
112//!
113//! You can also generate a new csv file based on the data sample provided.
114//!
115//! ```
116//! extern crate test_data_generation;
117//!
118//! use test_data_generation::data_sample_parser::DataSampleParser;
119//!
120//! fn main() {
121//!     let mut dsp =  DataSampleParser::new();
122//!
123//!     // Use the default delimiter (comma)
124//!    	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
125//!    	dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
126//! }
127//! ```
128#![crate_type = "lib"]
129#![crate_name = "test_data_generation"]
130
131#[macro_use]
132extern crate log;
133
134#[macro_use]
135extern crate serde_derive;
136extern crate crossbeam;
137extern crate csv;
138extern crate indexmap;
139extern crate levenshtein;
140extern crate rand;
141extern crate regex;
142extern crate serde;
143extern crate serde_json;
144extern crate serde_yaml;
145extern crate yaml_rust;
146
147use crate::engine::{Fact, PatternDefinition};
148use std::collections::BTreeMap;
149use std::fs::File;
150use std::io;
151use std::io::prelude::*;
152use std::io::Write;
153use std::ops::AddAssign;
154
/// Maps a symbolic pattern to the number of times it has occurred.
type PatternMap = BTreeMap<String, u32>;

/// Maps a pattern size (length) to the number of times it has occurred.
type SizeMap = BTreeMap<u32, u32>;

/// Maps a pattern size (length) to its percent chance of occurrence.
type SizeRankMap = BTreeMap<u32, f64>;
158
159#[derive(Clone, Serialize, Deserialize, Debug)]
160/// Represents a Profile for sample data that has been analyzed and can be used to generate realistic data
161pub struct Profile {
162    /// An identifier (not necessarily unique) that is used to differentiate profiles from one another
163    pub id: Option<String>,
164    /// A list of symbolic patterns with a distinct count of occurrences
165    pub patterns: PatternMap,
166    /// The total number of patterns in the profile
167    pub pattern_total: u32,
168    /// A list of symbolic patterns in the profile
169    /// (used for temporary storage due to lifetime issues)
170    pub pattern_keys: Vec<String>,
171    /// A list of distinct counts for patterns in the profile
172    /// (used for temporary storage due to lifetime issues)
173    pub pattern_vals: Vec<u32>,
174    /// A list of symbolic patterns with their percent chance of occurrence
175    pub pattern_percentages: Vec<(String, f64)>,
176    /// A list of symbolic patterns with a running total of percent chance of occurrence, in increasing order
177    pub pattern_ranks: Vec<(String, f64)>,
178    /// A list of pattern lengths with a distinct count of occurrence
179    pub sizes: SizeMap,
180    /// the total number of pattern sizes (lengths) in the profile
181    pub size_total: u32,
182    /// A list of pattern sizes (lengths) with a running total of their percent chance of occurrence, in increasing order
183    pub size_ranks: Vec<(u32, f64)>,
184    /// The number of processors used to distribute the work load (multi-thread) while finding Facts to generate data
185    pub processors: u8,
186    /// A list of processors (which are lists of Facts) that store all the Facts in the profile
187    pub facts: Vec<Vec<Fact>>,
188}
189
190impl Profile {
191    /// Constructs a new Profile
192    ///
    /// # Example
194    ///
195    /// ```rust
196    /// extern crate test_data_generation;
197    ///
198    /// use test_data_generation::Profile;
199    ///
200    /// fn main() {
201    /// 	let placeholder = Profile::new();
202    /// }
203    /// ```
204    pub fn new() -> Profile {
205        Profile {
206            id: None,
207            patterns: PatternMap::new(),
208            pattern_total: 0,
209            pattern_keys: Vec::new(),
210            pattern_vals: Vec::new(),
211            pattern_percentages: Vec::new(),
212            pattern_ranks: Vec::new(),
213            sizes: SizeMap::new(),
214            size_total: 0,
215            size_ranks: Vec::new(),
216            processors: 4,
217            facts: Profile::new_facts(4),
218        }
219    }
220
221    /// Constructs a new Profile using an identifier
222    ///
    /// # Example
224    ///
225    /// ```rust
226    /// extern crate test_data_generation;
227    ///
228    /// use test_data_generation::Profile;
229    ///
230    /// fn main() {
231    /// 	let placeholder = Profile::new_with_id("12345".to_string());
232    /// }
233    /// ```
234    pub fn new_with_id(id: String) -> Profile {
235        Profile {
236            id: Some(id),
237            patterns: PatternMap::new(),
238            pattern_total: 0,
239            pattern_keys: Vec::new(),
240            pattern_vals: Vec::new(),
241            pattern_percentages: Vec::new(),
242            pattern_ranks: Vec::new(),
243            sizes: SizeMap::new(),
244            size_total: 0,
245            size_ranks: Vec::new(),
246            processors: 4,
247            facts: Profile::new_facts(4),
248        }
249    }
250
251    /// Constructs a new Profile with a specified number of processors to analyze the data.
252    /// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
253    ///
254    /// # Arguments
255    ///
256    /// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
    ///         Increasing the number of processors will speed up the generator by distributing the workload.
258    ///         The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
259    ///         NOTE: The default number of processors is 4.
260    ///
    /// # Example
262    ///
263    /// ```rust
264    /// extern crate test_data_generation;
265    ///
266    /// use test_data_generation::Profile;
267    ///
268    /// fn main() {
269    ///     let processors: u8 = 10;
270    /// 	let placeholder = Profile::new_with_processors(processors);
271    /// }
272    /// ```
273    pub fn new_with_processors(p: u8) -> Profile {
274        Profile {
275            id: None,
276            patterns: PatternMap::new(),
277            pattern_total: 0,
278            pattern_keys: Vec::new(),
279            pattern_vals: Vec::new(),
280            pattern_percentages: Vec::new(),
281            pattern_ranks: Vec::new(),
282            sizes: SizeMap::new(),
283            size_total: 0,
284            size_ranks: Vec::new(),
285            processors: p,
286            facts: Profile::new_facts(p),
287        }
288    }
289
290    /// Constructs a new Profile from an exported JSON file. This is used when restoring from "archive"
291    ///
292    /// # Arguments
293    ///
294    /// * `path: &str` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
295    ///
    /// # Example
297    ///
298    /// ```rust
299    /// extern crate test_data_generation;
300    ///
301    /// use test_data_generation::Profile;
302    ///
303    /// fn main() {
304    ///		let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
305    ///
306    ///     profile.pre_generate();
307    ///
308    ///     println!("The generated name is {:?}", profile.generate());
309    /// }
310    /// ```
311    pub fn from_file(path: &'static str) -> Profile {
312        // open the archive file
313        let mut file = match File::open(format!("{}.json", &path)) {
314            Err(_e) => {
315                error!("Could not open file {:?}", &path.to_string());
316                panic!("Could not open file {:?}", &path.to_string());
317            }
318            Ok(f) => {
319                info!("Successfully opened file {:?}", &path.to_string());
320                f
321            }
322        };
323
324        //read the archive file
325        let mut serialized = String::new();
326        match file.read_to_string(&mut serialized) {
327            Err(e) => {
328                error!(
329                    "Could not read file {:?} because of {:?}",
330                    &path.to_string(),
331                    e.to_string()
332                );
333                panic!(
334                    "Could not read file {:?} because of {:?}",
335                    &path.to_string(),
336                    e.to_string()
337                );
338            }
339            Ok(s) => {
340                info!("Successfully read file {:?}", &path.to_string());
341                s
342            }
343        };
344
345        //serde_json::from_str(&serialized).unwrap()
346        Self::from_serialized(&serialized)
347    }
348
349    /// Constructs a new Profile from a serialized (JSON) string of the Profile object. This is used when restoring from "archive"
350    ///
    /// # Example
352    ///
353    /// ```rust
354    /// extern crate test_data_generation;
355    ///
356    /// use test_data_generation::Profile;
357    ///
358    /// fn main() {
359    ///		let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
360    ///		let mut profile = Profile::from_serialized(&serialized);
361    ///
362    ///     profile.pre_generate();
363    ///
364    ///     println!("The generated name is {:?}", profile.generate());
365    /// }
366    /// ```
367    pub fn from_serialized(serialized: &str) -> Profile {
368        serde_json::from_str(&serialized).unwrap()
369    }
370
    /// This function converts a data point (&str) to a pattern and adds it to the profile
372    ///
373    /// # Arguments
374    ///
    /// * `entity: &str` - The textual str of the value to analyze.</br>
376    ///
377    /// # Example
378    ///
379    /// ```rust
380    /// extern crate test_data_generation;
381    ///
382    /// use test_data_generation::Profile;
383    ///
384    /// fn main() {
385    /// 	let mut profile =  Profile::new();
386    ///		profile.analyze("One");
387    ///		profile.analyze("Two");
388    ///		profile.analyze("Three");
389    ///		profile.analyze("Four");
390    ///
391    ///		assert_eq!(profile.patterns.len(), 4);
392    /// }
393    /// ```
394    pub fn analyze(&mut self, entity: &str) {
395        let rslt = PatternDefinition::new().analyze(entity);
396        let _t = self.apply_facts(rslt.0, rslt.1).map_err(|e| {
397            error!(
398                "Warning: Couldn't apply the pattern and facts for the entity {}!",
399                entity
400            );
401            e.to_string()
402        });
403    }
404
405    /// This function applies the pattern and list of Facts  to the profile
406    ///
407    /// # Arguments
408    ///
    /// * `pattern: String` - The string that represents the pattern of the entity that was analyzed.</br>
410    /// * `facts: Vec<Fact>` - A Vector containing the Facts based on the analysis (one for each char in the entity).</br>
411    ///
412    /// # Example
413    ///
414    /// ```rust
415    /// extern crate test_data_generation;
416    ///
417    /// use test_data_generation::engine::{Fact, PatternDefinition};
418    /// use test_data_generation::Profile;
419    ///
420    /// fn main() {
421    /// 	let mut profile =  Profile::new();
422    ///		let results = PatternDefinition::new().analyze("Word");
423    ///
424    ///		assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
425    /// }
426    /// ```
427    #[inline]
428    pub fn apply_facts(&mut self, pattern: String, facts: Vec<Fact>) -> Result<i32, String> {
429        // balance the storing of facts across all the vectors that can be processed in parallel
430        let mut i = 0;
431        for f in facts.into_iter() {
432            if i == self.processors {
433                i = 0;
434            }
435
436            self.facts[i as usize].push(f);
437            i += 1;
438        }
439
440        // store the pattern
441        AddAssign::add_assign(self.patterns.entry(pattern.to_string()).or_insert(0), 1);
442
443        // store the total number of patterns generated so far
444        self.pattern_total = self.patterns.values().sum::<u32>();
445
446        // analyze sizes
447        AddAssign::add_assign(self.sizes.entry(pattern.len() as u32).or_insert(0), 1);
448        self.size_total = self.sizes.values().sum::<u32>();
449
450        self.pattern_keys = self.patterns.keys().cloned().collect();
451        self.pattern_vals = self.patterns.values().cloned().collect();
452
453        Ok(1)
454    }
455
456    /// This function calculates the patterns to use by the chance they will occur (as cumulative percentage) in decreasing order
457    ///
458    /// # Example
459    ///
460    /// ```rust
461    /// extern crate test_data_generation;
462    ///
463    /// use test_data_generation::Profile;
464    ///
465    /// fn main() {
466    /// 	let mut profile =  Profile::new();
467    ///
468    ///    	profile.analyze("Smith, John");
469    ///    	profile.analyze("O'Brian, Henny");
470    ///    	profile.analyze("Dale, Danny");
471    ///    	profile.analyze("Rickets, Ronnae");
472    ///    	profile.analyze("Richard, Richie");
473    ///    	profile.analyze("Roberts, Blake");
474    ///    	profile.analyze("Conways, Sephen");
475    ///
476    ///    	profile.pre_generate();
477    ///    	let test = [("CvccvccpSCvccvv".to_string(), 28.57142857142857 as f64), ("CcvccpSCvcc".to_string(), 42.857142857142854 as f64), ("CvccvccpSCvccvc".to_string(), 57.14285714285714 as f64), ("CvcvcccpSCcvcv".to_string(), 71.42857142857142 as f64), ("CvcvpSCvccc".to_string(), 85.7142857142857 as f64), ("V@CcvvcpSCvccc".to_string(), 99.99999999999997 as f64)];
478    ///
479    ///    	assert_eq!(profile.pattern_ranks, test);
480    /// }
481    /// ```
482    #[inline]
483    pub fn cum_patternmap(&mut self) {
484        // Reference: https://users.rust-lang.org/t/cannot-infer-an-appropriate-lifetime-for-autoref/13360/3
485
486        debug!("calculating the cumulative percentage of occurences for data point patterns...");
487
488        // calculate the percentage by patterns
489        // -> {"CcvccpSCvcc": 14.285714285714285, "CvccvccpSCvccvc": 14.285714285714285, "CvccvccpSCvccvv": 28.57142857142857, "CvcvcccpSCcvcv": 14.285714285714285, "CvcvpSCvccc": 14.285714285714285, "V~CcvvcpSCvccc": 14.285714285714285}
490        let n = self.patterns.len();
491
492        // see issue: https://github.com/dsietz/test-data-generation/issues/88
493        self.pattern_percentages.clear();
494
495        for m in 0..n {
496            self.pattern_percentages.push((
497                self.pattern_keys[m].clone(),
498                (self.pattern_vals[m] as f64 / self.pattern_total as f64) * 100.0,
499            ));
500        }
501
502        // sort the ranks by percentages in decreasing order
503        // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 14.285714285714285), ("CvccvccpSCvccvc", 14.285714285714285), ("CvcvcccpSCcvcv", 14.285714285714285), ("CvcvpSCvccc", 14.285714285714285), ("V~CcvvcpSCvccc", 14.285714285714285)]
504        self.pattern_percentages
505            .sort_by(|&(_, a), &(_, b)| b.partial_cmp(&a).unwrap());
506
507        // calculate the cumulative sum of the pattern rankings
508        // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 42.857142857142854), ("CvccvccpSCvccvc", 57.14285714285714), ("CvcvcccpSCcvcv", 71.42857142857142), ("CvcvpSCvccc", 85.7142857142857), ("V~CcvvcpSCvccc", 99.99999999999997)]
509        let mut rank: f64 = 0.00;
510
511        // see issue: https://github.com/dsietz/test-data-generation/issues/88
512        self.pattern_ranks.clear();
513
514        for pttrn in self.pattern_percentages.iter() {
515            let tmp = pttrn.1 + rank;
516            self.pattern_ranks.push((pttrn.0.clone(), tmp));
517            rank = tmp;
518        }
519    }
520
521    /// This function calculates the sizes to use by the chance they will occur (as cumulative percentage) in decreasing order
522    ///
523    /// # Example
524    ///
525    /// ```rust
526    /// extern crate test_data_generation;
527    ///
528    /// use test_data_generation::Profile;
529    ///
530    /// fn main() {
531    /// 	let mut profile =  Profile::new();
532    ///		profile.analyze("One");
533    ///		profile.analyze("Two");
534    ///		profile.analyze("Three");
535    ///		profile.analyze("Four");
536    ///		profile.analyze("Five");
537    ///		profile.analyze("Six");
538    ///
539    ///     profile.cum_sizemap();
540    ///
541    ///		print!("The size ranks are {:?}", profile.size_ranks);
542    ///     // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
543    /// }
544    /// ```
545    #[inline]
546    pub fn cum_sizemap(&mut self) {
547        debug!("calculating the cumulative percentage of occurences for data point sizes...");
548        // calculate the percentage by sizes
549        // -> {11: 28.57142857142857, 14: 14.285714285714285, 15: 57.14285714285714}
550        let mut size_ranks = SizeRankMap::new();
551
552        for key in self.sizes.keys() {
553            size_ranks.insert(
554                *key,
555                (*self.sizes.get(key).unwrap() as f64 / self.size_total as f64) * 100.0,
556            );
557        }
558
559        // sort the ranks by percentages in decreasing order
560        // -> [(15, 57.14285714285714), (11, 28.57142857142857), (14, 14.285714285714285)]
561        let mut sizes = size_ranks.iter().collect::<Vec<_>>();
562        sizes.sort_by(|&(_, a), &(_, b)| b.partial_cmp(a).unwrap());
563
564        // calculate the cumulative sum of the size rankings
565        // -> [(15, 57.14285714285714), (11, 85.71428571428571), (14, 100)]
566        self.size_ranks = sizes
567            .iter()
568            .scan((0_u32, 0.00_f64), |state, &(&k, &v)| {
569                *state = (k, state.1 + &v);
570                Some(*state)
571            })
572            .collect::<Vec<(_, _)>>();
573    }
574
    /// This function generates realistic test data based on the sample data that was analyzed.
576    ///
577    /// # Example
578    ///
579    /// ```rust
580    /// extern crate test_data_generation;
581    ///
582    /// use test_data_generation::Profile;
583    ///
584    /// fn main() {
585    /// 	let mut profile =  Profile::new();
586    ///
587    ///		profile.analyze("One");
588    ///		profile.analyze("Two");
589    ///		profile.analyze("Three");
590    ///		profile.analyze("Four");
591    ///		profile.analyze("Five");
592    ///
593    ///     profile.pre_generate();
594    ///
595    ///		print!("The test data {:?} was generated.", profile.generate());
596    /// }
597    /// ```
598    #[inline]
599    pub fn generate(&mut self) -> String {
600        // 1. get a random number
601        let s: f64 = random_percentage!();
602
603        // 2. find the first pattern that falls within the percentage chance of occurring
604        // NOTE: The following 2 lines has been commented out because this doesn't need to
605        //       happen since the patterns are already ranks by percent chance of occurring
606        //       and therefore sizes (lengths) as well since the patterns include the full
607        //       length of the entitiy analyzed.
608        //let size = self.size_ranks.iter().find(|&&x|&x.1 >= &s).unwrap().0;
609        //let pattern = self.pattern_ranks.iter().find(|x|&x.1 >= &s && x.0.len() == size as usize).unwrap().clone();
610        let pattern = self
611            .pattern_ranks
612            .iter()
613            .find(|x| &x.1 >= &s)
614            .unwrap()
615            .clone();
616
617        // lastly, generate the test data using facts that adhere to the pattern
618        self.generate_from_pattern(pattern.0)
619    }
620
621    /// This function generates realistic test data based on the sample data that was analyzed.
622    ///
623    /// # Arguments
624    ///
625    /// * `pattern: String` - The pattern to reference when generating the test data.</br>
626    ///
627    /// # Example
628    ///
629    /// ```rust
630    /// extern crate test_data_generation;
631    ///
632    /// use test_data_generation::Profile;
633    ///
634    /// fn main() {
635    /// 	let mut profile =  Profile::new();
636    ///
637    ///		profile.analyze("01/13/2017");
638    ///		profile.analyze("11/24/2017");
639    ///		profile.analyze("08/05/2017");
640    ///
641    ///     profile.pre_generate();
642    ///
643    ///  	let generated = profile.generate_from_pattern("##p##p####".to_string());
644    ///
645    ///     assert_eq!(generated.len(), 10);
646    /// }
647    /// ```
648    #[inline]
649    pub fn generate_from_pattern(&self, pattern: String) -> String {
650        let pattern_chars = pattern.chars().collect::<Vec<char>>();
651        let mut generated = String::new();
652        let prev_char = ' ';
653
654        // iterate through the chars in the pattern string
655        for (idx, ch) in pattern_chars.iter().enumerate() {
656            match crossbeam::scope(|scope| {
657                let c = ch;
658                let starts = if idx == 0 { 1 } else { 0 };
659                let ends = if idx == pattern_chars.len() - 1 { 1 } else { 0 };
660                let mut fact_options = vec![];
661                let prior_char = prev_char;
662
663                // iterate through the processors (vec) that hold the lists (vec) of facts
664                for v in &self.facts {
665                    let selected_facts = scope.spawn(move |_| {
666                        let mut facts = vec![];
667
668                        // iterate through the list of facts
669                        for value in v {
670                            if value.starts_with == starts
671                                && value.ends_with == ends
672                                && value.pattern_placeholder == *c
673                                && value.index_offset == idx as u32
674                            {
675                                facts.push(value.key);
676
677                                // if the value.key's prior char matches the prior generated char, then weight the value.key
678                                // to increase the chance of it being used when generated
679                                if value.prior_key.unwrap_or(' ') == prior_char {
680                                    facts.push(value.key);
681                                    facts.push(value.key);
682                                }
683
684                                // if the value.key's index_offset matches the current index, then weight the value.key
685                                // to increase the chance of it being used when generated
686                                if value.index_offset == idx as u32 {
687                                    facts.push(value.key);
688                                    facts.push(value.key);
689                                }
690                            }
691                        }
692
693                        facts
694                    });
695
696                    //append the selected_facts to the fact_options
697                    //fact_options.extend_from_slice(&selected_facts.join());
698                    match selected_facts.join() {
699                        Ok(sf) => fact_options.extend_from_slice(&sf),
700                        Err(err) => {
701                            error!("{:?}", err);
702                            panic!("{:?}", err);
703                        }
704                    }
705                }
706
707                //select a fact to use as the generated char
708                let rnd_start = 0;
709                let rnd_end = fact_options.len() - 1;
710
711                if rnd_start >= rnd_end {
712                    //generated.push(fact_options[0 as usize]);
713                    fact_options[0_usize]
714                } else {
715                    let x: u32 = random_between!(rnd_start, rnd_end);
716                    //prev_char = fact_options[x as usize];
717                    //generated.push(prev_char);
718                    fact_options[x as usize]
719                }
720            }) {
721                Ok(c) => generated.push(c),
722                Err(err) => {
723                    error!("{:?}", err);
724                    panic!("{:?}", err);
725                }
726            }
727        }
728
729        generated
730    }
731
    /// This function learns by measuring how realistic the test data it generates is compared to the sample data that was provided.
733    ///
734    /// # Arguments
735    ///
736    /// * `control_list: Vec<String>` - The list of strings to compare against. This would be the real data from the data sample.</br>
737    ///
738    /// # Example
739    ///
740    /// ```rust
741    /// extern crate test_data_generation;
742    ///
743    /// use test_data_generation::Profile;
744    ///
745    /// fn main() {
746    /// 	let mut profil =  Profile::new();
747    /// 	let sample_data = vec!("Smith, John".to_string(),"Doe, John".to_string(),"Dale, Danny".to_string(),"Rickets, Ronney".to_string());
748    ///
749    /// 	for sample in sample_data.iter().clone() {
750    /// 		profil.analyze(&sample);
751    /// 	}
752    ///
    /// 	// in order to learn, the profile must be prepared with pre_generate()
754    ///		// so it can generate data to learn from
755    ///		profil.pre_generate();
756    ///
757    /// 	let learning = profil.learn_from_entity(sample_data).unwrap();
758    ///
759    /// 	assert_eq!(learning, true);
760    /// }
761    /// ```
762    pub fn learn_from_entity(&mut self, control_list: Vec<String>) -> Result<bool, String> {
763        for _n in 0..10 {
764            let experiment = self.generate();
765            let mut percent_similarity: Vec<f64> = Vec::new();
766
767            for control in control_list.iter().clone() {
768                debug!("Comparing {} with {} ...", &control, &experiment);
769                percent_similarity.push(self.realistic_test(&control, &experiment));
770            }
771
772            let percent =
773                percent_similarity.iter().sum::<f64>() as f64 / percent_similarity.len() as f64;
774            debug!("Percent similarity is {} ...", &percent);
775
776            if percent >= 80_f64 {
777                self.analyze(&experiment);
778            }
779        }
780
781        Ok(true)
782    }
783
784    /// This function calculates the levenshtein distance between 2 strings.
785    /// See: https://crates.io/crates/levenshtein
786    ///
787    /// # Arguments
788    ///
789    /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
790    /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
791    ///
    /// # Example
793    ///
794    /// ```rust
795    /// extern crate test_data_generation;
796    ///
797    /// use test_data_generation::Profile;
798    ///
799    /// fn main() {
800    ///		let mut profile =  Profile::new();
801    ///
802    ///     assert_eq!(profile.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
803    /// }
    /// ```
805    pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
806        // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
807        levenshtein_distance!(control, experiment)
808    }
809
    /// This function calculates, as a percentage, how closely 2 strings match.
    ///
    /// NOTE(review): the calculation itself lives in the `realistic_test!` macro, which is defined
    /// elsewhere. The example below and the "Percent similarity" logging in `learn_from_entity()`
    /// indicate that a higher value means the strings are more alike - confirm against the macro.
    ///
    /// # Arguments
    ///
    /// * `control: &str` - The string to compare against. This would be the real data from the data sample.
    /// * `experiment: &str` - The string to compare. This would be the generated data that you want to score against the control.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     assert_eq!(profile.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
    /// }
    /// ```
    #[inline]
    pub fn realistic_test(&mut self, control: &str, experiment: &str) -> f64 {
        // The work is delegated to the crate's `realistic_test!` macro.
        realistic_test!(control, experiment)
    }
834
835    /// This function is called from within the implementated structure and returns a list processors (Vec) with empty lists (Vec) for their Facts.
836    /// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
837    ///
838    /// # Arguments
839    ///
840    /// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
841    ///         Increasing the number of processors will speed up the generator be ditributing the workload.
842    ///         The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
843    ///         NOTE: The default number of processors is 4.
844    ///
845    #[inline]
846    fn new_facts(p: u8) -> Vec<Vec<Fact>> {
847        let mut vec_main = Vec::new();
848
849        for _ in 0..p {
850            vec_main.push(Vec::new());
851        }
852
853        vec_main
854    }
855
    /// This function prepares the Profile for generating data.
    /// It calls the `cum_sizemap()` and `cum_patternmap()` helpers, in that order. After it has run,
    /// the size ranks hold the accumulated (running total) percentage per size, as the example below shows.
    /// Call this function after analyzing the sample data and before generating data.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///     profile.analyze("One");
    ///     profile.analyze("Two");
    ///     profile.analyze("Three");
    ///     profile.analyze("Four");
    ///     profile.analyze("Five");
    ///     profile.analyze("Six");
    ///
    ///     profile.pre_generate();
    ///
    ///     print!("The size ranks are {:?}", profile.size_ranks);
    ///     // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
    /// }
    /// ```
    pub fn pre_generate(&mut self) {
        info!("Preparing the profile for data generation...");
        // Sizes first, then patterns.
        // NOTE(review): whether the helpers depend on this order is not visible here - keep it as is.
        self.cum_sizemap();
        self.cum_patternmap();
        info!("Profile: preparing generator...");
    }
886
    /// This function resets the patterns that the Profile has analyzed.
    /// Call this method whenever you wish to "clear" the Profile.
    ///
    /// Only the `patterns` map is replaced with an empty one; this function does not touch
    /// any other field of the Profile.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     profile.analyze("One");
    ///     profile.analyze("Two");
    ///     profile.analyze("Three");
    ///
    ///     let x = profile.patterns.len();
    ///
    ///     profile.reset_analyze();
    ///
    ///     profile.analyze("Four");
    ///     profile.analyze("Five");
    ///     profile.analyze("Six");
    ///     profile.analyze("Seven");
    ///     profile.analyze("Eight");
    ///     profile.analyze("Nine");
    ///     profile.analyze("Ten");
    ///
    ///     let y = profile.patterns.len();
    ///
    ///     assert_eq!(x, 3);
    ///     assert_eq!(y, 5);
    /// }
    /// ```
    pub fn reset_analyze(&mut self) {
        info!("Resetting the profile ...");
        // Drop every pattern collected so far by swapping in a fresh, empty map.
        self.patterns = PatternMap::new();
        info!("Profile: patterns have been reset ...");
    }
927
928    /// This function saves (exports) the Profile to a JSON file.
929    /// This is useful when you wish to reuse the algorithm to generate more test data later.
930    ///
931    /// # Arguments
932    ///
933    /// * `field: String` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
934    ///
935    /// #Errors
936    /// If this function encounters any form of I/O or other error, an error variant will be returned.
937    /// Otherwise, the function returns Ok(true).</br>
938    ///
939    /// #Example
940    ///
941    /// ```rust
942    /// extern crate test_data_generation;
943    ///
944    /// use test_data_generation::Profile;
945    ///
946    /// fn main() {
947    /// 	// analyze the dataset
948    ///		let mut profile =  Profile::new();
949    ///     profile.analyze("Smith, John");
950    ///		profile.analyze("O'Brian, Henny");
951    ///		profile.analyze("Dale, Danny");
952    ///		profile.analyze("Rickets, Ronney");
953    ///
954    ///		profile.pre_generate();
955    ///
956    ///     assert_eq!(profile.save("./tests/samples/sample-00-profile").unwrap(), true);
957    /// }
958    ///
959    pub fn save(&mut self, path: &'static str) -> Result<bool, io::Error> {
960        let dsp_json = serde_json::to_string(&self).unwrap();
961
962        // Create the archive file
963        let mut file = match File::create(format!("{}.json", &path)) {
964            Err(e) => {
965                error!("Could not create file {:?}", &path.to_string());
966                return Err(e);
967            }
968            Ok(f) => {
969                info!("Successfully exported to {:?}", &path.to_string());
970                f
971            }
972        };
973
974        // Write the json string to file, returns io::Result<()>
975        match file.write_all(dsp_json.as_bytes()) {
976            Err(e) => {
977                error!("Could not write to file {}", &path.to_string());
978                return Err(e);
979            }
980            Ok(_) => {
981                info!("Successfully exported to {}", &path.to_string());
982            }
983        };
984
985        Ok(true)
986    }
987
    /// This function converts the Profile to a serialized JSON string.
    ///
    /// # Panics
    ///
    /// This function unwraps the result of `serde_json::to_string()`, so it panics if the
    /// Profile cannot be serialized.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     // create the profile
    ///     let mut data_profile =  Profile::new();
    ///
    ///     // analyze the dataset
    ///     data_profile.analyze("OK");
    ///
    ///     println!("{}", data_profile.serialize());
    ///     // {"id":null,"patterns":{"VC":1},"pattern_total":1,"pattern_keys":["VC"],"pattern_vals":[1],"pattern_percentages":[],"pattern_ranks":[],"sizes":{"2":1},"size_total":1,"size_ranks":[],"processors":4,"facts":[[{"key":"O","prior_key":null,"next_key":"K","pattern_placeholder":"V","starts_with":1,"ends_with":0,"index_offset":0}],[{"key":"K","prior_key":"O","next_key":null,"pattern_placeholder":"C","starts_with":0,"ends_with":1,"index_offset":1}],[],[]]}
    /// }
    /// ```
    pub fn serialize(&mut self) -> String {
        serde_json::to_string(&self).unwrap()
    }
1011}
1012
1013#[macro_use]
1014pub mod macros;
1015pub mod configs;
1016pub mod data_sample_parser;
1017pub mod engine;
1018pub mod shared;
1019
1020// Unit Tests
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    // ensure the facts of an analyzed entity can be applied to the Profile
    fn apply_facts() {
        let mut profile = Profile::new();
        let results = PatternDefinition::new().analyze("Word");

        assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
    }

    #[test]
    // ensure the levenshtein distance between 2 strings is calculated
    fn levenshtein_test() {
        let mut profil = Profile::new();

        assert_eq!(
            profil.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()),
            3_usize
        );
    }

    #[test]
    // ensure the realistic test score between 2 strings is calculated
    fn realistic_data_test() {
        let mut profil = Profile::new();

        assert_eq!(
            profil.realistic_test("kitten", "sitting"),
            76.92307692307692_f64
        );
    }

    #[test]
    // ensure the Profile can learn from a list of control data
    fn learn_from_entity() {
        let mut profil = Profile::new();
        let sample_data = vec![
            "Smith, John".to_string(),
            "Doe, John".to_string(),
            "Dale, Danny".to_string(),
            "Rickets, Ronney".to_string(),
        ];

        for sample in &sample_data {
            profil.analyze(sample);
        }

        profil.pre_generate();

        let learning = profil.learn_from_entity(sample_data).unwrap();

        assert!(learning);
    }

    #[test]
    // ensure resetting a Profile (which writes to the log) does not panic;
    // the test passes as long as the call returns
    fn logging_test() {
        let mut profile = Profile::new();
        profile.reset_analyze();
    }

    #[test]
    // ensure a Profile keeps the id it was created with
    fn new_profile_with_id() {
        let mut profile = Profile::new_with_id("12345".to_string());
        profile.pre_generate();

        assert_eq!(profile.id.unwrap(), "12345".to_string());
    }

    #[test]
    // ensure a Profile can be loaded from an archived file
    // NOTE: uses the same sample path that the `save_profile` test writes to
    fn new_profile_from_file() {
        let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    #[test]
    #[should_panic]
    // ensure loading a Profile from an unreadable file panics
    fn new_profile_from_file_bad_data() {
        let mut profile = Profile::from_file("./tests/samples/not-readable");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    #[test]
    #[should_panic(expected = "Could not open file \"./tests/samples/bad-path\"")]
    // ensure loading a Profile from a path that does not exist panics with a helpful message
    fn new_profile_from_file_bad_path() {
        let mut profile = Profile::from_file("./tests/samples/bad-path");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    #[test]
    // ensure a Profile can be restored from a serialized JSON string
    fn new_profile_from_serialized() {
        let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
        let mut profile = Profile::from_serialized(serialized);
        profile.pre_generate();

        assert_eq!(profile.generate(), "OK");
    }

    #[test]
    // ensure a Profile can be created with a custom number of processors
    fn new_profile_new_with() {
        let profile = Profile::new_with_processors(10);

        assert_eq!(profile.processors, 10);
    }

    #[test]
    // ensure Profile is analyzing all the sample data points
    fn profile_analyze() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronney");

        assert_eq!(profil.patterns.len(), 4);
    }

    #[test]
    // ensure Profile is able to find the facts that relate to a pattern
    // NOTE: Dates need work! e.g.: 00/15/0027
    fn profile_generate_from_pattern_date() {
        let mut profil = Profile::new();
        profil.analyze("01/13/2017");
        profil.analyze("11/24/2017");
        profil.analyze("08/05/2017");

        profil.pre_generate();
        let generated = profil.generate_from_pattern("##p##p####".to_string());

        assert_eq!(10, generated.len());
    }

    #[test]
    // ensure Profile is able to find the facts that relate to a pattern
    fn profile_generate_from_pattern_string() {
        let mut profil = Profile::new();
        profil.analyze("First");
        profil.analyze("Next");
        profil.analyze("Last");

        profil.pre_generate();
        let generated = profil.generate_from_pattern("Cvcc".to_string());

        assert_eq!(4, generated.len());
    }

    #[test]
    // ensure Profile is generating correct test data
    fn profile_generate() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronnae");
        profil.analyze("Richard, Richie");
        profil.analyze("Roberts, Blake");
        profil.analyze("Conways, Sephen");

        profil.pre_generate();

        assert!(profil.generate().len() > 10);
    }

    #[test]
    // issue #31
    // ensure Profile doesn't generate a name with a backslash preceding an apostrophe
    fn profile_generate_with_apostrophe() {
        let mut profil = Profile::new();
        profil.analyze("O'Brien");

        profil.pre_generate();
        let generated = profil.generate();

        assert_eq!(generated, "O'Brien");
    }

    #[test]
    // ensure Profile is providing the correct pattern ranks after analyzing the sample data
    fn profile_pregenerate_patterns() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronnae");
        profil.analyze("Richard, Richie");
        profil.analyze("Roberts, Blake");
        profil.analyze("Conways, Sephen");

        profil.pre_generate();
        // (pattern, accumulated percentage)
        let test = [
            ("CvccvccpSCvccvv".to_string(), 28.57142857142857_f64),
            ("CcvccpSCvcc".to_string(), 42.857142857142854_f64),
            ("CvccvccpSCvccvc".to_string(), 57.14285714285714_f64),
            ("CvcvcccpSCcvcv".to_string(), 71.42857142857142_f64),
            ("CvcvpSCvccc".to_string(), 85.7142857142857_f64),
            ("V@CcvvcpSCvccc".to_string(), 99.99999999999997_f64),
        ];

        assert_eq!(profil.pattern_ranks, test);
    }

    #[test]
    // ensure Profile is providing the correct size ranks after analyzing the sample data
    fn profile_pregenerate_sizes() {
        let mut profil = Profile::new();

        profil.analyze("Smith, Johny"); //12
        profil.analyze("O'Brian, Hen"); //12
        profil.analyze("Dale, Danny"); //11
        profil.analyze("O'Henry, Al"); //11
        profil.analyze("Rickets, Ro"); //11
        profil.analyze("Mr. Wilbers"); //11
        profil.analyze("Po, Al"); //6

        profil.pre_generate();
        // (size, accumulated percentage)
        let test = [
            (11, 57.14285714285714),
            (12, 85.71428571428571),
            (6, 100_f64),
        ];

        assert_eq!(profil.size_ranks, test);
    }

    #[test]
    // ensure a Profile can be saved (archived) to a file
    fn save_profile() {
        let mut profile = Profile::new();
        profile.analyze("Smith, John");
        profile.analyze("O'Brian, Henny");
        profile.analyze("Dale, Danny");
        profile.analyze("Rickets, Ronney");

        profile.pre_generate();

        assert!(profile.save("./tests/samples/sample-00-profile").unwrap());
    }

    #[test]
    // ensure a Profile can be exported (to be archived) as JSON
    fn serialize() {
        let mut profil = Profile::new();

        // analyze the dataset
        profil.analyze("OK");

        let serialized = profil.serialize();
        assert_eq!(serialized, "{\"id\":null,\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}");
    }
}
test_data_generation/lib.rs

test_data_generation/
lib.rs