// test-data-generation 0.3.4

// Copyright 2018 David Sietz and [`test-data-generator` contributors](https://github.com/dsietz/test-data-generator/blob/master/CONTRIBUTORS.md).
// Licensed under the MIT license
// (see LICENSE or <https://opensource.org/licenses/Apache-2.0>)
//
//!
//! There are multiple ways to use the Test Data Generation library. It all depends on your intent.
//!
//! ### Profile
//!
//! The easiest way is to use a Profile. The `profile` module provides functionality to create a profile on a data sample (Strings).
//! Once a profile has been made, data can be generated by calling the _pre_generate()_ and _generate()_ functions, in that order.
//!
//! ```
//! extern crate test_data_generation;
//!
//! use test_data_generation::Profile;
//!
//! fn main() {
//!     // create an empty profile
//! 	let mut data_profile =  Profile::new();
//!
//!     // analyze the dataset
//! 	data_profile.analyze("Smith, John");
//! 	data_profile.analyze("Doe, John");
//! 	data_profile.analyze("Dale, Danny");
//! 	data_profile.analyze("Rickets, Ronney");
//!
//!     // confirm 4 data samples were analyzed
//!    	assert_eq!(data_profile.patterns.len(), 4);
//!
//!     // prepare the generator
//!     data_profile.pre_generate();
//!
//!     // generate some data
//!    	println!("The generated name is {:?}", data_profile.generate());
//! }
//! ```
//!
//! You can also export (archive as JSON file) the profile for later use.
//! This allows for the algorithm to be retrieved without having to store the actual data that was analyzed.
//!
//!	```
//! extern crate test_data_generation;
//!
//! use test_data_generation::Profile;
//!
//! fn main() {
//!		//create a profile and analyze some data
//!		let mut old_profile =  Profile::new();
//!		old_profile.analyze("Smith, John");
//!		old_profile.analyze("O'Brian, Henny");
//!		old_profile.analyze("Dale, Danny");
//!		old_profile.analyze("Rickets, Ronney");
//!
//!		old_profile.pre_generate();
//!
//!		//save the profile for later
//!		assert_eq!(old_profile.save("./tests/samples/sample-00-profile").unwrap(), true);
//!
//!		// create a new profile from the archive json file
//!		let mut new_profile = Profile::from_file("./tests/samples/sample-00-profile");
//!
//!		// generate some data. NOTE that the pre-generate() was already called prior to saving
//!     println!("The generated name is {:?}", new_profile.generate());
//! }
//! ```
//!
//! ### Data Sample Parser
//!
//! If you are using CSV files of data samples, then you may wish to use a Data Sample Parser.
//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it, so that test data can be generated based on profiles.
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//!     let mut dsp = DataSampleParser::new();
//!     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//!
//!     println!("My new name is {} {}", dsp.generate_record()[0], dsp.generate_record()[1]);
//!     // My new name is Abbon Aady
//! }
//! ```
//!
//! You can also save the Data Sample Parser (the algorithm) as an archive file (json) ...
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//!     let mut dsp =  DataSampleParser::new();
//!     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//!
//!     assert_eq!(dsp.save(&String::from("./tests/samples/sample-01-dsp")).unwrap(), true);
//! }
//! ```
//!
//! and use it at a later time.
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//!     let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-01-dsp"));
//!
//! 	println!("Sample data is {:?}", dsp.generate_record()[0]);
//! }
//! ```
//!
//! You can also generate a new csv file based on the data sample provided.
//!
//! ```
//! extern crate test_data_generation;
//!
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//!     let mut dsp =  DataSampleParser::new();
//!
//!     // Use the default delimiter (comma)
//!    	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//!    	dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
//! }
//! ```
#![crate_type = "lib"]
#![crate_name = "test_data_generation"]

#[macro_use]
extern crate log;

#[macro_use]
extern crate serde_derive;
extern crate crossbeam;
extern crate csv;
extern crate indexmap;
extern crate levenshtein;
extern crate rand;
extern crate regex;
extern crate serde;
extern crate serde_json;
extern crate serde_yaml;
extern crate yaml_rust;

use crate::engine::{Fact, PatternDefinition};
use std::collections::BTreeMap;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::io::Write;
use std::ops::AddAssign;

// Symbolic pattern -> number of times that pattern has been counted.
type PatternMap = BTreeMap<String, u32>;
// Pattern size (length) -> number of times that size has been counted.
type SizeMap = BTreeMap<u32, u32>;
// Pattern size (length) -> percent chance of that size occurring.
type SizeRankMap = BTreeMap<u32, f64>;

#[derive(Clone, Serialize, Deserialize, Debug)]
/// Represents a Profile for sample data that has been analyzed and can be used to generate realistic data.
///
/// The counters (`patterns`, `sizes`, the totals and `facts`) are updated whenever a pattern and its
/// Facts are applied to the profile. The percentage and rank lists are derived from those counters
/// by `cum_patternmap()` and `cum_sizemap()`.
pub struct Profile {
    /// An identifier (not necessarily unique) that is used to differentiate profiles from one another
    pub id: Option<String>,
    /// The symbolic patterns, each with the number of times it has occurred
    pub patterns: PatternMap,
    /// The total number of patterns in the profile (the sum of all the counts in `patterns`)
    pub pattern_total: u32,
    /// A list of symbolic patterns in the profile (a copy of the keys of `patterns`)
    /// (used for temporary storage due to lifetime issues)
    pub pattern_keys: Vec<String>,
    /// A list of distinct counts for patterns in the profile (a copy of the values of `patterns`)
    /// (used for temporary storage due to lifetime issues)
    pub pattern_vals: Vec<u32>,
    /// A list of symbolic patterns with their percent chance of occurrence
    pub pattern_percentages: Vec<(String, f64)>,
    /// A list of symbolic patterns with a running total of percent chance of occurrence, in increasing order
    pub pattern_ranks: Vec<(String, f64)>,
    /// The pattern lengths, each with the number of times it has occurred
    pub sizes: SizeMap,
    /// The total number of pattern sizes (lengths) in the profile (the sum of all the counts in `sizes`)
    pub size_total: u32,
    /// A list of pattern sizes (lengths) with a running total of their percent chance of occurrence, in increasing order
    pub size_ranks: Vec<(u32, f64)>,
    /// The number of processors used to distribute the work load (multi-thread) while finding Facts to generate data
    pub processors: u8,
    /// A list of processors (which are lists of Facts) that store all the Facts in the profile
    pub facts: Vec<Vec<Fact>>,
}

impl Profile {
    /// Constructs a new Profile
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let placeholder = Profile::new();
    /// }
    /// ```
    pub fn new() -> Profile {
        // A default Profile is an anonymous Profile that uses the default of 4 processors.
        Profile::new_with_processors(4)
    }

    /// Constructs a new Profile using an identifier
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let placeholder = Profile::new_with_id("12345".to_string());
    /// }
    /// ```
    pub fn new_with_id(id: String) -> Profile {
        // Same as a default Profile, except that it carries the given identifier.
        Profile {
            id: Some(id),
            ..Profile::new()
        }
    }

    /// Constructs a new Profile with a specified number of processors to analyze the data.
    /// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
    ///
    /// # Arguments
    ///
    /// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
    ///         Increasing the number of processors will speed up the generator by distributing the workload.
    ///         The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
    ///         NOTE: The default number of processors is 4.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let processors: u8 = 10;
    /// 	let placeholder = Profile::new_with_processors(processors);
    /// }
    /// ```
    pub fn new_with_processors(p: u8) -> Profile {
        Profile {
            id: None,
            patterns: PatternMap::new(),
            pattern_total: 0,
            pattern_keys: Vec::new(),
            pattern_vals: Vec::new(),
            pattern_percentages: Vec::new(),
            pattern_ranks: Vec::new(),
            sizes: SizeMap::new(),
            size_total: 0,
            size_ranks: Vec::new(),
            processors: p,
            facts: Profile::new_facts(p),
        }
    }

    /// Constructs a new Profile from an exported JSON file. This is used when restoring from "archive"
    ///
    /// # Arguments
    ///
    /// * `path: &str` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
    ///
    /// #Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///		let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
    ///
    ///     profile.pre_generate();
    ///
    ///     println!("The generated name is {:?}", profile.generate());
    /// }
    /// ```
    pub fn from_file(path: &'static str) -> Profile {
        // open the archive file
        let mut file = match File::open(format!("{}.json", &path)) {
            Err(_e) => {
                error!("Could not open file {:?}", &path.to_string());
                panic!("Could not open file {:?}", &path.to_string());
            }
            Ok(f) => {
                info!("Successfully opened file {:?}", &path.to_string());
                f
            }
        };

        //read the archive file
        let mut serialized = String::new();
        match file.read_to_string(&mut serialized) {
            Err(e) => {
                error!(
                    "Could not read file {:?} because of {:?}",
                    &path.to_string(),
                    e.to_string()
                );
                panic!(
                    "Could not read file {:?} because of {:?}",
                    &path.to_string(),
                    e.to_string()
                );
            }
            Ok(s) => {
                info!("Successfully read file {:?}", &path.to_string());
                s
            }
        };

        //serde_json::from_str(&serialized).unwrap()
        Self::from_serialized(&serialized)
    }

    /// Constructs a new Profile from a serialized (JSON) string of the Profile object. This is used when restoring from "archive"
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///		let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
    ///		let mut profile = Profile::from_serialized(&serialized);
    ///
    ///     profile.pre_generate();
    ///
    ///     println!("The generated name is {:?}", profile.generate());
    /// }
    /// ```
    pub fn from_serialized(serialized: &str) -> Profile {
        // A string that is not a valid serialized Profile makes the unwrap panic.
        serde_json::from_str::<Profile>(serialized).unwrap()
    }

    /// This function converts a data point (&str) to a pattern and adds it to the profile
    ///
    /// # Arguments
    ///
    /// * `entity: &str` - The textual str of the value to analyze.</br>
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let mut profile =  Profile::new();
    ///		profile.analyze("One");
    ///		profile.analyze("Two");
    ///		profile.analyze("Three");
    ///		profile.analyze("Four");
    ///
    ///		assert_eq!(profile.patterns.len(), 4);
    /// }
    /// ```
    pub fn analyze(&mut self, entity: &str) {
        // translate the entity into its symbolic pattern (.0) and its list of Facts (.1)
        let analysis = PatternDefinition::new().analyze(entity);

        // a failure is only logged; it is not reported back to the caller
        if self.apply_facts(analysis.0, analysis.1).is_err() {
            error!(
                "Warning: Couldn't apply the pattern and facts for the entity {}!",
                entity
            );
        }
    }

    /// This function applies the pattern and list of Facts to the profile
    ///
    /// # Arguments
    ///
    /// * `pattern: String` - The string that represents the pattern of the entity that was analyzed.</br>
    /// * `facts: Vec<Fact>` - A Vector containing the Facts based on the analysis (one for each char in the entity).</br>
    ///
    /// # Errors
    ///
    /// Returns an `Err`, leaving the profile untouched, when there are Facts to store but the profile
    /// has no processor list to store them in (e.g. it was created with zero processors).
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::engine::{Fact, PatternDefinition};
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///     let results = PatternDefinition::new().analyze("Word");
    ///
    ///     assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
    /// }
    /// ```
    #[inline]
    pub fn apply_facts(&mut self, pattern: String, facts: Vec<Fact>) -> Result<i32, String> {
        // Never address more lists than actually exist, even if `processors` and `facts`
        // have drifted apart (both are public and can come from an archive).
        let bucket_count = self.facts.len().min(self.processors as usize);

        if bucket_count == 0 && !facts.is_empty() {
            return Err(String::from(
                "The profile has no processors to store the facts in.",
            ));
        }

        // balance the storing of facts across all the vectors that can be processed in parallel
        // (round-robin, starting with the first list on every call)
        for (idx, fact) in facts.into_iter().enumerate() {
            self.facts[idx % bucket_count].push(fact);
        }

        // the size statistic is keyed by the pattern length; take it before the pattern is moved
        let pattern_size = pattern.len() as u32;

        // store the pattern
        AddAssign::add_assign(self.patterns.entry(pattern).or_insert(0), 1);

        // store the total number of patterns generated so far
        self.pattern_total = self.patterns.values().sum::<u32>();

        // analyze sizes
        AddAssign::add_assign(self.sizes.entry(pattern_size).or_insert(0), 1);
        self.size_total = self.sizes.values().sum::<u32>();

        // refresh the flattened copies of the pattern map that cum_patternmap() reads
        self.pattern_keys = self.patterns.keys().cloned().collect();
        self.pattern_vals = self.patterns.values().cloned().collect();

        Ok(1)
    }

    /// This function calculates the patterns to use by the chance they will occur (as cumulative percentage) in decreasing order
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let mut profile =  Profile::new();
    ///
    ///    	profile.analyze("Smith, John");
    ///    	profile.analyze("O'Brian, Henny");
    ///    	profile.analyze("Dale, Danny");
    ///    	profile.analyze("Rickets, Ronnae");
    ///    	profile.analyze("Richard, Richie");
    ///    	profile.analyze("Roberts, Blake");
    ///    	profile.analyze("Conways, Sephen");
    ///
    ///    	profile.pre_generate();
    ///    	let test = [("CvccvccpSCvccvv".to_string(), 28.57142857142857 as f64), ("CcvccpSCvcc".to_string(), 42.857142857142854 as f64), ("CvccvccpSCvccvc".to_string(), 57.14285714285714 as f64), ("CvcvcccpSCcvcv".to_string(), 71.42857142857142 as f64), ("CvcvpSCvccc".to_string(), 85.7142857142857 as f64), ("V@CcvvcpSCvccc".to_string(), 99.99999999999997 as f64)];
    ///
    ///    	assert_eq!(profile.pattern_ranks, test);
    /// }
    /// ```
    #[inline]
    pub fn cum_patternmap(&mut self) {
        // Reference: https://users.rust-lang.org/t/cannot-infer-an-appropriate-lifetime-for-autoref/13360/3

        debug!("calculating the cumulative percentage of occurences for data point patterns...");

        let pattern_count = self.patterns.len();
        let total = self.pattern_total as f64;

        // Step 1: the percent chance of each pattern, in the order of the pattern keys.
        // The list is rebuilt from scratch on every call
        // (see issue: https://github.com/dsietz/test-data-generation/issues/88)
        // -> [("CcvccpSCvcc", 14.285714285714285), ("CvccvccpSCvccvc", 14.285714285714285), ("CvccvccpSCvccvv", 28.57142857142857), ...]
        self.pattern_percentages = (0..pattern_count)
            .map(|idx| {
                let key = self.pattern_keys[idx].clone();
                let share = (self.pattern_vals[idx] as f64 / total) * 100.0;
                (key, share)
            })
            .collect();

        // Step 2: most likely patterns first; the sort is stable, so ties keep their key order
        // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 14.285714285714285), ...]
        self.pattern_percentages
            .sort_by(|lhs, rhs| rhs.1.partial_cmp(&lhs.1).unwrap());

        // Step 3: running total of the sorted percentages (also rebuilt from scratch, issue 88)
        // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 42.857142857142854), ..., ("V~CcvvcpSCvccc", 99.99999999999997)]
        let mut running_total: f64 = 0.00;
        self.pattern_ranks = self
            .pattern_percentages
            .iter()
            .map(|(pattern, share)| {
                running_total += *share;
                (pattern.clone(), running_total)
            })
            .collect();
    }

    /// This function calculates the sizes to use by the chance they will occur (as cumulative percentage) in decreasing order
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let mut profile =  Profile::new();
    ///		profile.analyze("One");
    ///		profile.analyze("Two");
    ///		profile.analyze("Three");
    ///		profile.analyze("Four");
    ///		profile.analyze("Five");
    ///		profile.analyze("Six");
    ///
    ///     profile.cum_sizemap();
    ///
    ///		print!("The size ranks are {:?}", profile.size_ranks);
    ///     // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
    /// }
    /// ```
    #[inline]
    pub fn cum_sizemap(&mut self) {
        debug!("calculating the cumulative percentage of occurences for data point sizes...");
        // calculate the percentage by sizes
        // -> {11: 28.57142857142857, 14: 14.285714285714285, 15: 57.14285714285714}
        let mut size_ranks = SizeRankMap::new();

        for key in self.sizes.keys() {
            size_ranks.insert(
                *key,
                (*self.sizes.get(key).unwrap() as f64 / self.size_total as f64) * 100.0,
            );
        }

        // sort the ranks by percentages in decreasing order
        // -> [(15, 57.14285714285714), (11, 28.57142857142857), (14, 14.285714285714285)]
        let mut sizes = size_ranks.iter().collect::<Vec<_>>();
        sizes.sort_by(|&(_, a), &(_, b)| b.partial_cmp(a).unwrap());

        // calculate the cumulative sum of the size rankings
        // -> [(15, 57.14285714285714), (11, 85.71428571428571), (14, 100)]
        self.size_ranks = sizes
            .iter()
            .scan((0_u32, 0.00_f64), |state, &(&k, &v)| {
                *state = (k, state.1 + &v);
                Some(*state)
            })
            .collect::<Vec<(_, _)>>();
    }

    /// This function generates realistic test data based on the sample data that was analyzed.
    ///
    /// A random percentage is drawn and the first pattern whose cumulative rank reaches that
    /// percentage is used to generate the data.
    ///
    /// # Panics
    ///
    /// Panics if the profile has no pattern ranks (e.g. no data was analyzed or `pre_generate()`
    /// was not called).
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     profile.analyze("One");
    ///     profile.analyze("Two");
    ///     profile.analyze("Three");
    ///     profile.analyze("Four");
    ///     profile.analyze("Five");
    ///
    ///     profile.pre_generate();
    ///
    ///     print!("The test data {:?} was generated.", profile.generate());
    /// }
    /// ```
    #[inline]
    pub fn generate(&mut self) -> String {
        // 1. get a random number
        // NOTE(review): presumably a percentage up to 100 - confirm against the macro definition
        let s: f64 = random_percentage!();

        // 2. find the first pattern that falls within the percentage chance of occurring.
        //    The sizes do not need to be consulted: the patterns are already ranked by their
        //    chance of occurring and a pattern covers the full length of the analyzed entity.
        //
        //    The ranks are floating point running totals, so the last one can end up marginally
        //    below 100 (e.g. 99.99999999999997). A draw above it matches no rank at all, so fall
        //    back to the last pattern instead of panicking.
        let pattern = self
            .pattern_ranks
            .iter()
            .find(|rank| rank.1 >= s)
            .or_else(|| self.pattern_ranks.last())
            .map(|rank| rank.0.clone())
            .expect("the profile has no pattern ranks; analyze data and call pre_generate() first");

        // lastly, generate the test data using facts that adhere to the pattern
        self.generate_from_pattern(pattern)
    }

    /// This function generates realistic test data based on the sample data that was analyzed.
    ///
    /// # Arguments
    ///
    /// * `pattern: String` - The pattern to reference when generating the test data.</br>
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let mut profile =  Profile::new();
    ///
    ///		profile.analyze("01/13/2017");
    ///		profile.analyze("11/24/2017");
    ///		profile.analyze("08/05/2017");
    ///
    ///     profile.pre_generate();
    ///
    ///  	let generated = profile.generate_from_pattern("##p##p####".to_string());
    ///
    ///     assert_eq!(generated.len(), 10);
    /// }
    /// ```
    #[inline]
    pub fn generate_from_pattern(&self, pattern: String) -> String {
        // Builds the result one char at a time: for every symbol of the pattern, the Facts that
        // fit that position are collected and one of their keys is picked as the generated char.
        let pattern_chars = pattern.chars().collect::<Vec<char>>();
        let mut generated = String::new();
        // NOTE(review): prev_char is never reassigned (the update further down is commented out),
        // so the "prior char" weighting below always compares against ' '.
        let prev_char = ' ';

        // iterate through the chars in the pattern string
        for (idx, ch) in pattern_chars.iter().enumerate() {
            match crossbeam::scope(|scope| {
                let c = ch;
                // flags used to match Facts recorded for the first / last position
                let starts = if idx == 0 { 1 } else { 0 };
                let ends = if idx == pattern_chars.len() - 1 { 1 } else { 0 };
                let mut fact_options = vec![];
                let prior_char = prev_char;

                // iterate through the processors (vec) that hold the lists (vec) of facts
                for v in &self.facts {
                    let selected_facts = scope.spawn(move |_| {
                        let mut facts = vec![];

                        // iterate through the list of facts
                        for value in v {
                            // a Fact qualifies when it has the same start/end flags, the same
                            // pattern symbol and the same position as the char being generated
                            if value.starts_with == starts
                                && value.ends_with == ends
                                && value.pattern_placeholder == *c
                                && value.index_offset == idx as u32
                            {
                                facts.push(value.key);

                                // if the value.key's prior char matches the prior generated char, then weight the value.key
                                // to increase the chance of it being used when generated
                                if value.prior_key.unwrap_or(' ') == prior_char {
                                    facts.push(value.key);
                                    facts.push(value.key);
                                }

                                // if the value.key's index_offset matches the current index, then weight the value.key
                                // to increase the chance of it being used when generated
                                // NOTE(review): this condition is already part of the enclosing `if`,
                                // so it is always true here and every qualifying key is weighted.
                                if value.index_offset == idx as u32 {
                                    facts.push(value.key);
                                    facts.push(value.key);
                                }
                            }
                        }

                        facts
                    });

                    // append the selected_facts to the fact_options
                    // NOTE(review): the thread is joined right after it is spawned, inside the loop,
                    // so the next list is only scanned once this one has finished.
                    //fact_options.extend_from_slice(&selected_facts.join());
                    match selected_facts.join() {
                        Ok(sf) => fact_options.extend_from_slice(&sf),
                        Err(err) => {
                            error!("{:?}", err);
                            panic!("{:?}", err);
                        }
                    }
                }

                //select a fact to use as the generated char
                // NOTE(review): when no Fact qualified, fact_options is empty; `len() - 1` then
                // underflows and indexing fails - presumably callers only pass patterns that
                // were analyzed; confirm.
                let rnd_start = 0;
                let rnd_end = fact_options.len() - 1;

                if rnd_start >= rnd_end {
                    // a single option: nothing to choose from
                    //generated.push(fact_options[0 as usize]);
                    fact_options[0_usize]
                } else {
                    // NOTE(review): random_between! is defined elsewhere; whether rnd_end is
                    // inclusive cannot be told from here.
                    let x: u32 = random_between!(rnd_start, rnd_end);
                    //prev_char = fact_options[x as usize];
                    //generated.push(prev_char);
                    fact_options[x as usize]
                }
            }) {
                // the scope returns the char that was selected for this position
                Ok(c) => generated.push(c),
                Err(err) => {
                    error!("{:?}", err);
                    panic!("{:?}", err);
                }
            }
        }

        generated
    }

    /// This function learns by measuring how realistic the test data it generates to the sample data that was provided.
    ///
    /// # Arguments
    ///
    /// * `control_list: Vec<String>` - The list of strings to compare against. This would be the real data from the data sample.</br>
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	let mut profil =  Profile::new();
    /// 	let sample_data = vec!("Smith, John".to_string(),"Doe, John".to_string(),"Dale, Danny".to_string(),"Rickets, Ronney".to_string());
    ///
    /// 	for sample in sample_data.iter().clone() {
    /// 		profil.analyze(&sample);
    /// 	}
    ///
    /// 	// in order to learn the profile must be prepared with pre_generate()
    ///		// so it can generate data to learn from
    ///		profil.pre_generate();
    ///
    /// 	let learning = profil.learn_from_entity(sample_data).unwrap();
    ///
    /// 	assert_eq!(learning, true);
    /// }
    /// ```
    pub fn learn_from_entity(&mut self, control_list: Vec<String>) -> Result<bool, String> {
        // run 10 learning rounds; each round generates one candidate and scores it
        for _round in 0..10 {
            let candidate = self.generate();

            // score the candidate against every control string
            let scores: Vec<f64> = control_list
                .iter()
                .map(|control| {
                    debug!("Comparing {} with {} ...", &control, &candidate);
                    self.realistic_test(control, &candidate)
                })
                .collect();

            // average score over all the control strings
            let percent = scores.iter().sum::<f64>() / scores.len() as f64;
            debug!("Percent similarity is {} ...", &percent);

            // a candidate that is similar enough is fed back into the profile as a new sample
            if percent >= 80_f64 {
                self.analyze(&candidate);
            }
        }

        Ok(true)
    }

    /// This function calculates the levenshtein distance between 2 strings.
    /// See: https://crates.io/crates/levenshtein
    ///
    /// # Arguments
    ///
    /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
    /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     assert_eq!(profile.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
    /// }
    /// ```
    pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
        // the computation is delegated to the crate's levenshtein_distance! macro
        // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
        levenshtein_distance!(control, experiment)
    }

    /// This function calculates the percent difference between 2 strings.
    ///
    /// NOTE(review): `learn_from_entity()` treats the returned value as a percent similarity
    /// (a higher value means a closer match) - confirm against the `realistic_test!` macro.
    ///
    /// # Arguments
    ///
    /// * `control: &str` - The string to compare against. This would be the real data from the data sample.</br>
    /// * `experiment: &str` - The string to compare. This would be the generated data for which you want to find the percent difference.</br>
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     assert_eq!(profile.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
    /// }
    /// ```
    #[inline]
    pub fn realistic_test(&mut self, control: &str, experiment: &str) -> f64 {
        // the computation is delegated to the crate's realistic_test! macro
        realistic_test!(control, experiment)
    }

    /// This function is called from within the implemented structure and returns a list of processors (Vec),
    /// each holding an empty list (Vec) for its Facts.
    ///
    /// # Arguments
    ///
    /// * `p: u8` - The number of processors, i.e. how many empty Fact lists are created.</br>
    ///         NOTE: The default number of processors is 4.
    ///
    #[inline]
    fn new_facts(p: u8) -> Vec<Vec<Fact>> {
        // one empty list of Facts per processor
        (0..p).map(|_| Vec::new()).collect()
    }

    /// This function prepares the size and pattern accumulated percentages
    /// by calling `cum_sizemap()` and `cum_patternmap()`, in that order.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///     profile.analyze("One");
    ///     profile.analyze("Two");
    ///     profile.analyze("Three");
    ///     profile.analyze("Four");
    ///     profile.analyze("Five");
    ///     profile.analyze("Six");
    ///
    ///     profile.pre_generate();
    ///
    ///     print!("The size ranks are {:?}", profile.size_ranks);
    ///     // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
    /// }
    /// ```
    pub fn pre_generate(&mut self) {
        info!("Preparing the profile for data generation...");
        // rebuild the cumulative size ranks (size_ranks)
        self.cum_sizemap();
        // rebuild the pattern percentages and cumulative pattern ranks that generate() reads
        self.cum_patternmap();
        info!("Profile: preparing generator...");
    }

    /// This function resets the patterns that the Profile has analyzed.
    /// Call this method whenever you wish to "clear" the Profile.
    ///
    /// Only the `patterns` map is replaced (with a new, empty `PatternMap`);
    /// no other field of the Profile is modified by this function.
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     let mut profile =  Profile::new();
    ///
    ///     profile.analyze("One");
    ///     profile.analyze("Two");
    ///     profile.analyze("Three");
    ///
    ///     let x = profile.patterns.len();
    ///
    ///     profile.reset_analyze();
    ///
    ///     profile.analyze("Four");
    ///     profile.analyze("Five");
    ///     profile.analyze("Six");
    ///     profile.analyze("Seven");
    ///     profile.analyze("Eight");
    ///     profile.analyze("Nine");
    ///     profile.analyze("Ten");
    ///
    ///     let y = profile.patterns.len();
    ///
    ///     assert_eq!(x, 3);
    ///     assert_eq!(y, 5);
    /// }
    /// ```
    pub fn reset_analyze(&mut self) {
        info!("Resetting the profile ...");
        // Drop every pattern collected so far by swapping in an empty map.
        self.patterns = PatternMap::new();
        info!("Profile: patterns have been reset ...");
    }

    /// This function saves (exports) the Profile to a JSON file.
    /// This is useful when you wish to reuse the algorithm to generate more test data later.
    ///
    /// # Arguments
    ///
    /// * `field: String` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
    ///
    /// #Errors
    /// If this function encounters any form of I/O or other error, an error variant will be returned.
    /// Otherwise, the function returns Ok(true).</br>
    ///
    /// #Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    /// 	// analyze the dataset
    ///		let mut profile =  Profile::new();
    ///     profile.analyze("Smith, John");
    ///		profile.analyze("O'Brian, Henny");
    ///		profile.analyze("Dale, Danny");
    ///		profile.analyze("Rickets, Ronney");
    ///
    ///		profile.pre_generate();
    ///
    ///     assert_eq!(profile.save("./tests/samples/sample-00-profile").unwrap(), true);
    /// }
    ///
    pub fn save(&mut self, path: &'static str) -> Result<bool, io::Error> {
        let dsp_json = serde_json::to_string(&self).unwrap();

        // Create the archive file
        let mut file = match File::create(format!("{}.json", &path)) {
            Err(e) => {
                error!("Could not create file {:?}", &path.to_string());
                return Err(e);
            }
            Ok(f) => {
                info!("Successfully exported to {:?}", &path.to_string());
                f
            }
        };

        // Write the json string to file, returns io::Result<()>
        match file.write_all(dsp_json.as_bytes()) {
            Err(e) => {
                error!("Could not write to file {}", &path.to_string());
                return Err(e);
            }
            Ok(_) => {
                info!("Successfully exported to {}", &path.to_string());
            }
        };

        Ok(true)
    }

    /// This function converts the Profile to a serialized JSON string.
    ///
    /// # Panics
    ///
    /// Panics if `serde_json` fails to serialize the Profile (the result is unwrapped).
    ///
    /// # Example
    ///
    /// ```rust
    /// extern crate test_data_generation;
    ///
    /// use test_data_generation::Profile;
    ///
    /// fn main() {
    ///     // create the profile
    ///     let mut data_profile =  Profile::new();
    ///
    ///     // analyze the dataset
    ///     data_profile.analyze("OK");
    ///
    ///     println!("{}", data_profile.serialize());
    ///     // {"id":null,"patterns":{"VC":1},"pattern_total":1,"pattern_keys":["VC"],"pattern_vals":[1],"pattern_percentages":[],"pattern_ranks":[],"sizes":{"2":1},"size_total":1,"size_ranks":[],"processors":4,"facts":[[{"key":"O","prior_key":null,"next_key":"K","pattern_placeholder":"V","starts_with":1,"ends_with":0,"index_offset":0}],[{"key":"K","prior_key":"O","next_key":null,"pattern_placeholder":"C","starts_with":0,"ends_with":1,"index_offset":1}],[],[]]}
    /// }
    /// ```
    pub fn serialize(&mut self) -> String {
        serde_json::to_string(&self).unwrap()
    }
}

#[macro_use]
pub mod macros;
pub mod configs;
pub mod data_sample_parser;
pub mod engine;
pub mod shared;

// Unit Tests
#[cfg(test)]
mod tests {
    use super::*;

    // ensure the facts produced by analyzing a word can be applied to a Profile
    #[test]
    fn apply_facts() {
        let mut profile = Profile::new();
        let results = PatternDefinition::new().analyze("Word");

        assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
    }

    // ensure the classic "kitten" -> "sitting" example yields a distance of 3
    #[test]
    fn levenshtein_test() {
        let mut profil = Profile::new();

        assert_eq!(
            profil.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()),
            3_usize
        );
    }

    // ensure the realistic test reports the expected percentage for "kitten" vs "sitting"
    #[test]
    fn realistic_data_test() {
        let mut profil = Profile::new();

        assert_eq!(
            profil.realistic_test("kitten", "sitting"),
            76.92307692307692_f64
        );
    }

    // ensure a Profile can learn from the same entities it has analyzed
    #[test]
    fn learn_from_entity() {
        let mut profil = Profile::new();
        let sample_data = vec![
            "Smith, John".to_string(),
            "Doe, John".to_string(),
            "Dale, Danny".to_string(),
            "Rickets, Ronney".to_string(),
        ];

        // borrow the samples here; the vector itself is handed over to learn_from_entity() below
        for sample in &sample_data {
            profil.analyze(sample);
        }

        profil.pre_generate();

        let learning = profil.learn_from_entity(sample_data).unwrap();

        assert!(learning);
    }

    // passes as long as the logging calls made inside reset_analyze() do not panic
    #[test]
    fn logging_test() {
        let mut profile = Profile::new();
        profile.reset_analyze();
    }

    // ensure the id given at construction is kept by the Profile
    #[test]
    fn new_profile_with_id() {
        let mut profile = Profile::new_with_id("12345".to_string());
        profile.pre_generate();

        assert_eq!(profile.id.unwrap(), "12345".to_string());
    }

    // ensure a Profile loaded from an archived file can generate data
    // NOTE(review): this reads the same `sample-00-profile` archive that `save_profile`
    // (and the doc examples) write; if both run concurrently the read may race the
    // write - confirm the fixture is committed or give this test its own file.
    #[test]
    fn new_profile_from_file() {
        let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    // ensure loading an unreadable archive panics
    #[test]
    #[should_panic]
    fn new_profile_from_file_bad_data() {
        let mut profile = Profile::from_file("./tests/samples/not-readable");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    // ensure loading from a path that does not exist panics with the expected message
    #[test]
    #[should_panic(expected = "Could not open file \"./tests/samples/bad-path\"")]
    fn new_profile_from_file_bad_path() {
        let mut profile = Profile::from_file("./tests/samples/bad-path");
        profile.pre_generate();

        assert!(!profile.generate().is_empty());
    }

    // ensure a Profile rebuilt from a serialized JSON string generates the analyzed data
    #[test]
    fn new_profile_from_serialized() {
        let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
        let mut profile = Profile::from_serialized(&serialized);
        profile.pre_generate();

        assert_eq!(profile.generate(), "OK");
    }

    // ensure the requested number of processors is stored on the Profile
    #[test]
    fn new_profile_new_with() {
        let profile = Profile::new_with_processors(10);

        assert_eq!(profile.processors, 10);
    }

    // ensure Profile is analyzing all the sample data points
    #[test]
    fn profile_analyze() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronney");

        assert_eq!(profil.patterns.len(), 4);
    }

    // ensure Profile is able to find the facts that relate to a pattern
    // NOTE: Dates need work! e.g.: 00/15/0027
    #[test]
    fn profile_generate_from_pattern_date() {
        let mut profil = Profile::new();
        profil.analyze("01/13/2017");
        profil.analyze("11/24/2017");
        profil.analyze("08/05/2017");

        profil.pre_generate();
        let generated = profil.generate_from_pattern("##p##p####".to_string());

        assert_eq!(10, generated.len());
    }

    // ensure Profile is able to find the facts that relate to a pattern
    #[test]
    fn profile_generate_from_pattern_string() {
        let mut profil = Profile::new();
        profil.analyze("First");
        profil.analyze("Next");
        profil.analyze("Last");

        profil.pre_generate();
        let generated = profil.generate_from_pattern("Cvcc".to_string());

        assert_eq!(4, generated.len());
    }

    // ensure Profile is generating correct test data
    #[test]
    fn profile_generate() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronnae");
        profil.analyze("Richard, Richie");
        profil.analyze("Roberts, Blake");
        profil.analyze("Conways, Sephen");

        profil.pre_generate();

        assert!(profil.generate().len() > 10);
    }

    // issue #31
    // ensure Profile doesn't generate a name with a backslash preceding an apostrophe
    #[test]
    fn profile_generate_with_apostrophe() {
        let mut profil = Profile::new();
        profil.analyze("O'Brien");

        profil.pre_generate();
        let generated = profil.generate();

        assert_eq!(generated, "O'Brien");
    }

    // ensure Profile is providing the correct pattern ranks after analyzing the sample data
    #[test]
    fn profile_pregenerate_patterns() {
        let mut profil = Profile::new();
        profil.analyze("Smith, John");
        profil.analyze("O'Brian, Henny");
        profil.analyze("Dale, Danny");
        profil.analyze("Rickets, Ronnae");
        profil.analyze("Richard, Richie");
        profil.analyze("Roberts, Blake");
        profil.analyze("Conways, Sephen");

        profil.pre_generate();
        // (pattern, accumulated percentage)
        let test = [
            ("CvccvccpSCvccvv".to_string(), 28.57142857142857),
            ("CcvccpSCvcc".to_string(), 42.857142857142854),
            ("CvccvccpSCvccvc".to_string(), 57.14285714285714),
            ("CvcvcccpSCcvcv".to_string(), 71.42857142857142),
            ("CvcvpSCvccc".to_string(), 85.7142857142857),
            ("V@CcvvcpSCvccc".to_string(), 99.99999999999997),
        ];

        assert_eq!(profil.pattern_ranks, test);
    }

    // ensure Profile is providing the correct size ranks after analyzing the sample data
    #[test]
    fn profile_pregenerate_sizes() {
        let mut profil = Profile::new();

        profil.analyze("Smith, Johny"); //12
        profil.analyze("O'Brian, Hen"); //12
        profil.analyze("Dale, Danny"); //11
        profil.analyze("O'Henry, Al"); //11
        profil.analyze("Rickets, Ro"); //11
        profil.analyze("Mr. Wilbers"); //11
        profil.analyze("Po, Al"); //6

        profil.pre_generate();
        // (size, accumulated percentage)
        let test = [
            (11, 57.14285714285714),
            (12, 85.71428571428571),
            (6, 100.0),
        ];

        assert_eq!(profil.size_ranks, test);
    }

    // ensure a Profile can be saved (archived) to a JSON file
    #[test]
    fn save_profile() {
        let mut profile = Profile::new();
        profile.analyze("Smith, John");
        profile.analyze("O'Brian, Henny");
        profile.analyze("Dale, Danny");
        profile.analyze("Rickets, Ronney");

        profile.pre_generate();

        assert!(profile.save("./tests/samples/sample-00-profile").unwrap());
    }

    // ensure a Profile can be exported (to be archived) as JSON
    #[test]
    fn serialize() {
        let mut profil = Profile::new();

        // analyze the dataset
        profil.analyze("OK");

        let serialized = profil.serialize();
        assert_eq!(serialized, "{\"id\":null,\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}");
    }
}