// Copyright 2018 David Sietz and [`test-data-generator` contributors](https://github.com/dsietz/test-data-generator/blob/master/CONTRIBUTORS.md).
// Licensed under the MIT license
// (see LICENSE or <https://opensource.org/licenses/Apache-2.0>)
//
//!
//! There are multiple ways to use the Test Data Generation library. It all depends on your intent.
//!
//! ### Profile
//!
//! The easiest way is to use a Profile. The `profile` module provides functionality to create a profile on a data sample (Strings).
//! Once a profile has been made, data can be generated by calling the _pre_generate()_ and _generate()_ functions, in that order.
//!
//! ```
//! extern crate test_data_generation;
//!
//! use test_data_generation::Profile;
//!
//! fn main() {
//! // create a new profile
//! let mut data_profile = Profile::new();
//!
//! // analyze the dataset
//! data_profile.analyze("Smith, John");
//! data_profile.analyze("Doe, John");
//! data_profile.analyze("Dale, Danny");
//! data_profile.analyze("Rickets, Ronney");
//!
//! // confirm 4 data samples were analyzed
//! assert_eq!(data_profile.patterns.len(), 4);
//!
//! // prepare the generator
//! data_profile.pre_generate();
//!
//! // generate some data
//! println!("The generated name is {:?}", data_profile.generate());
//! }
//! ```
//!
//! You can also export (archive as JSON file) the profile for later use.
//! This allows for the algorithm to be retrieved without having to store the actual data that was analyzed.
//!
//! ```
//! extern crate test_data_generation;
//!
//! use test_data_generation::Profile;
//!
//! fn main() {
//! //create a profile and analyze some data
//! let mut old_profile = Profile::new();
//! old_profile.analyze("Smith, John");
//! old_profile.analyze("O'Brian, Henny");
//! old_profile.analyze("Dale, Danny");
//! old_profile.analyze("Rickets, Ronney");
//!
//! old_profile.pre_generate();
//!
//! //save the profile for later
//! assert_eq!(old_profile.save("./tests/samples/sample-00-profile").unwrap(), true);
//!
//! // create a new profile from the archive json file
//! let mut new_profile = Profile::from_file("./tests/samples/sample-00-profile");
//!
//! // generate some data. NOTE that the pre-generate() was already called prior to saving
//! println!("The generated name is {:?}", new_profile.generate());
//! }
//! ```
//!
//! ### Data Sample Parser
//!
//! If you are using CSV files of data samples, then you may wish to use a Data Sample Parser.
//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it, so that test data can be generated based on profiles.
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//! let mut dsp = DataSampleParser::new();
//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//!
//! println!("My new name is {} {}", dsp.generate_record()[0], dsp.generate_record()[1]);
//! // My new name is Abbon Aady
//! }
//! ```
//!
//! You can also save the Data Sample Parser (the algorithm) as an archive file (json) ...
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//! let mut dsp = DataSampleParser::new();
//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//!
//! assert_eq!(dsp.save(&String::from("./tests/samples/sample-01-dsp")).unwrap(), true);
//! }
//! ```
//!
//! and use it at a later time.
//!
//! ```
//! extern crate test_data_generation;
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//! let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-01-dsp"));
//!
//! println!("Sample data is {:?}", dsp.generate_record()[0]);
//! }
//! ```
//!
//! You can also generate a new csv file based on the data sample provided.
//!
//! ```
//! extern crate test_data_generation;
//!
//! use test_data_generation::data_sample_parser::DataSampleParser;
//!
//! fn main() {
//! let mut dsp = DataSampleParser::new();
//!
//! // Use the default delimiter (comma)
//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
//! dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
//! }
//! ```
#![crate_type = "lib"]
#![crate_name = "test_data_generation"]
#[macro_use]
extern crate log;
#[macro_use]
extern crate serde_derive;
extern crate crossbeam;
extern crate csv;
extern crate indexmap;
extern crate levenshtein;
extern crate rand;
extern crate regex;
extern crate serde;
extern crate serde_json;
extern crate serde_yaml;
extern crate yaml_rust;
use crate::engine::{Fact, PatternDefinition};
use std::collections::BTreeMap;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::io::Write;
use std::ops::AddAssign;
// Maps a symbolic pattern to the number of times it has been counted (see `Profile::patterns`).
type PatternMap = BTreeMap<String, u32>;
// Maps a pattern length to the number of times it has been counted (see `Profile::sizes`).
type SizeMap = BTreeMap<u32, u32>;
// Maps a pattern length to its percent chance of occurrence (used by `Profile::cum_sizemap()`).
type SizeRankMap = BTreeMap<u32, f64>;
#[derive(Clone, Serialize, Deserialize, Debug)]
/// Represents a Profile for sample data that has been analyzed and can be used to generate realistic data.
///
/// The counting fields are filled while data is analyzed (`analyze()` / `apply_facts()`).
/// The percentage and rank fields are calculated by `pre_generate()`, which calls
/// `cum_sizemap()` and `cum_patternmap()`.
pub struct Profile {
/// An identifier (not necessarily unique) that is used to differentiate profiles from one another
pub id: Option<String>,
/// A list of symbolic patterns with a distinct count of occurrences
/// (key = pattern, value = number of times the pattern was applied)
pub patterns: PatternMap,
/// The total number of patterns in the profile
/// (the sum of all the counts in `patterns`, recalculated by `apply_facts()`)
pub pattern_total: u32,
/// A list of symbolic patterns in the profile
/// (used for temporary storage due to lifetime issues)
/// Refreshed from the keys of `patterns` by `apply_facts()`.
pub pattern_keys: Vec<String>,
/// A list of distinct counts for patterns in the profile
/// (used for temporary storage due to lifetime issues)
/// Refreshed from the values of `patterns` by `apply_facts()`.
pub pattern_vals: Vec<u32>,
/// A list of symbolic patterns with their percent chance of occurrence
/// (rebuilt by `cum_patternmap()` and sorted by percentage in decreasing order)
pub pattern_percentages: Vec<(String, f64)>,
/// A list of symbolic patterns with a running total of percent chance of occurrence, in increasing order
/// (rebuilt by `cum_patternmap()`; `generate()` picks the first entry whose running total covers a random percentage)
pub pattern_ranks: Vec<(String, f64)>,
/// A list of pattern lengths with a distinct count of occurrence
/// (key = length of the pattern, value = number of times that length was applied)
pub sizes: SizeMap,
/// the total number of pattern sizes (lengths) in the profile
/// (the sum of all the counts in `sizes`, recalculated by `apply_facts()`)
pub size_total: u32,
/// A list of pattern sizes (lengths) with a running total of their percent chance of occurrence, in increasing order
/// (rebuilt by `cum_sizemap()`)
pub size_ranks: Vec<(u32, f64)>,
/// The number of processors used to distribute the work load (multi-thread) while finding Facts to generate data
/// (`new()` and `new_with_id()` use 4)
pub processors: u8,
/// A list of processors (which are lists of Facts) that store all the Facts in the profile
/// (the constructors create one empty inner list per processor)
pub facts: Vec<Vec<Fact>>,
}
impl Profile {
/// Constructs a new Profile
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let placeholder = Profile::new();
/// }
/// ```
pub fn new() -> Profile {
    // An anonymous profile (no id) that uses the default of 4 processors.
    Profile::new_with_processors(4)
}
/// Constructs a new Profile using an identifier
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let placeholder = Profile::new_with_id("12345".to_string());
/// }
/// ```
pub fn new_with_id(id: String) -> Profile {
    // Same as a default profile (empty, 4 processors), except that it carries the identifier.
    Profile {
        id: Some(id),
        ..Profile::new()
    }
}
/// Constructs a new Profile with a specified number of processors to analyze the data.
/// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
///
/// # Arguments
///
/// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
/// Increasing the number of processors will speed up the generator by distributing the workload.
/// The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
/// NOTE: The default number of processors is 4.
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let processors: u8 = 10;
/// let placeholder = Profile::new_with_processors(processors);
/// }
/// ```
pub fn new_with_processors(p: u8) -> Profile {
    // one (initially empty) list of Facts per processor
    let facts = Profile::new_facts(p);

    Profile {
        id: None,
        processors: p,
        facts,
        // nothing has been analyzed yet, so all counters and lookups start out empty
        pattern_total: 0,
        size_total: 0,
        patterns: PatternMap::new(),
        sizes: SizeMap::new(),
        pattern_keys: vec![],
        pattern_vals: vec![],
        pattern_percentages: vec![],
        pattern_ranks: vec![],
        size_ranks: vec![],
    }
}
/// Constructs a new Profile from an exported JSON file. This is used when restoring from "archive"
///
/// # Arguments
///
/// * `path: &str` - The full path of the export file, excluding the file extension (e.g.: "./test/data/custom-names").</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
///
/// profile.pre_generate();
///
/// println!("The generated name is {:?}", profile.generate());
/// }
/// ```
pub fn from_file(path: &'static str) -> Profile {
// open the archive file
let mut file = match File::open(format!("{}.json", &path)) {
Err(_e) => {
error!("Could not open file {:?}", &path.to_string());
panic!("Could not open file {:?}", &path.to_string());
}
Ok(f) => {
info!("Successfully opened file {:?}", &path.to_string());
f
}
};
//read the archive file
let mut serialized = String::new();
match file.read_to_string(&mut serialized) {
Err(e) => {
error!(
"Could not read file {:?} because of {:?}",
&path.to_string(),
e.to_string()
);
panic!(
"Could not read file {:?} because of {:?}",
&path.to_string(),
e.to_string()
);
}
Ok(s) => {
info!("Successfully read file {:?}", &path.to_string());
s
}
};
//serde_json::from_str(&serialized).unwrap()
Self::from_serialized(&serialized)
}
/// Constructs a new Profile from a serialized (JSON) string of the Profile object. This is used when restoring from "archive"
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
/// let mut profile = Profile::from_serialized(&serialized);
///
/// profile.pre_generate();
///
/// println!("The generated name is {:?}", profile.generate());
/// }
/// ```
pub fn from_serialized(serialized: &str) -> Profile {
serde_json::from_str(&serialized).unwrap()
}
/// This function converts a data point (&str) to a pattern and adds it to the profile
///
/// # Arguments
///
/// * `entity: &str` - The textual str of the value to analyze.</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
/// profile.analyze("One");
/// profile.analyze("Two");
/// profile.analyze("Three");
/// profile.analyze("Four");
///
/// assert_eq!(profile.patterns.len(), 4);
/// }
/// ```
pub fn analyze(&mut self, entity: &str) {
    // .0 is the symbolic pattern of the entity, .1 the Facts that were found for it
    let analysis = PatternDefinition::new().analyze(entity);

    // a failure is only logged; the caller is not notified
    if self.apply_facts(analysis.0, analysis.1).is_err() {
        error!(
            "Warning: Couldn't apply the pattern and facts for the entity {}!",
            entity
        );
    }
}
/// This function applies the pattern and list of Facts to the profile
///
/// # Arguments
///
/// * `pattern: String` - The string that represents the pattern of the entity that was analyzed.</br>
/// * `facts: Vec<Fact>` - A Vector containing the Facts based on the analysis (one for each char in the entity).</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::engine::{Fact, PatternDefinition};
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
/// let results = PatternDefinition::new().analyze("Word");
///
/// assert_eq!(profile.apply_facts(results.0, results.1).unwrap(), 1);
/// }
/// ```
#[inline]
pub fn apply_facts(&mut self, pattern: String, facts: Vec<Fact>) -> Result<i32, String> {
    // Returns Ok(1) once the pattern and its Facts have been stored.
    // Returns Err when there are Facts to store but the profile has no list to put them in
    // (e.g.: a profile that was created with 0 processors); nothing is stored in that case.
    let buckets = self.facts.len();

    if buckets == 0 && !facts.is_empty() {
        return Err(format!(
            "Cannot store the facts for the pattern {:?} because the profile has no processors",
            pattern
        ));
    }

    // balance the storing of facts across all the vectors that can be processed in parallel
    // NOTE: the actual number of lists is used (instead of `processors`), so that a profile
    //       whose `processors` value disagrees with `facts` cannot index out of bounds
    for (n, f) in facts.into_iter().enumerate() {
        self.facts[n % buckets].push(f);
    }

    // analyze sizes (done before the pattern is moved into the map of patterns)
    AddAssign::add_assign(self.sizes.entry(pattern.len() as u32).or_insert(0), 1);

    // store the pattern
    AddAssign::add_assign(self.patterns.entry(pattern).or_insert(0), 1);

    // store the totals generated so far
    // (summed instead of incremented, because reset_analyze() empties `patterns`)
    self.pattern_total = self.patterns.values().sum::<u32>();
    self.size_total = self.sizes.values().sum::<u32>();

    // refresh the lookups that cum_patternmap() works with
    self.pattern_keys = self.patterns.keys().cloned().collect();
    self.pattern_vals = self.patterns.values().cloned().collect();

    Ok(1)
}
/// This function calculates the patterns to use by the chance they will occur (as cumulative percentage) in decreasing order
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// profile.analyze("Smith, John");
/// profile.analyze("O'Brian, Henny");
/// profile.analyze("Dale, Danny");
/// profile.analyze("Rickets, Ronnae");
/// profile.analyze("Richard, Richie");
/// profile.analyze("Roberts, Blake");
/// profile.analyze("Conways, Sephen");
///
/// profile.pre_generate();
/// let test = [("CvccvccpSCvccvv".to_string(), 28.57142857142857 as f64), ("CcvccpSCvcc".to_string(), 42.857142857142854 as f64), ("CvccvccpSCvccvc".to_string(), 57.14285714285714 as f64), ("CvcvcccpSCcvcv".to_string(), 71.42857142857142 as f64), ("CvcvpSCvccc".to_string(), 85.7142857142857 as f64), ("V@CcvvcpSCvccc".to_string(), 99.99999999999997 as f64)];
///
/// assert_eq!(profile.pattern_ranks, test);
/// }
/// ```
#[inline]
pub fn cum_patternmap(&mut self) {
    // Reference: https://users.rust-lang.org/t/cannot-infer-an-appropriate-lifetime-for-autoref/13360/3
    debug!("calculating the cumulative percentage of occurences for data point patterns...");

    // 1. the percent chance of occurrence for every pattern
    // -> [("CcvccpSCvcc", 14.285714285714285), ("CvccvccpSCvccvc", 14.285714285714285), ("CvccvccpSCvccvv", 28.57142857142857), ...]
    let total = self.pattern_total as f64;
    let mut percentages: Vec<(String, f64)> = (0..self.patterns.len())
        .map(|m| {
            (
                self.pattern_keys[m].clone(),
                (self.pattern_vals[m] as f64 / total) * 100.0,
            )
        })
        .collect();

    // 2. highest percentage first (the sort is stable, so equal percentages keep their order)
    // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 14.285714285714285), ...]
    percentages.sort_by(|lhs, rhs| rhs.1.partial_cmp(&lhs.1).unwrap());

    // 3. the running total of the percentages
    // -> [("CvccvccpSCvccvv", 28.57142857142857), ("CcvccpSCvcc", 42.857142857142854), ..., ("V~CcvvcpSCvccc", 99.99999999999997)]
    let mut running: f64 = 0.00;
    let ranks: Vec<(String, f64)> = percentages
        .iter()
        .map(|(pattern, pct)| {
            running += *pct;
            (pattern.clone(), running)
        })
        .collect();

    // both lists are replaced (not appended to), so calling this function repeatedly is safe
    // see issue: https://github.com/dsietz/test-data-generation/issues/88
    self.pattern_percentages = percentages;
    self.pattern_ranks = ranks;
}
/// This function calculates the sizes to use by the chance they will occur (as cumulative percentage) in decreasing order
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
/// profile.analyze("One");
/// profile.analyze("Two");
/// profile.analyze("Three");
/// profile.analyze("Four");
/// profile.analyze("Five");
/// profile.analyze("Six");
///
/// profile.cum_sizemap();
///
/// print!("The size ranks are {:?}", profile.size_ranks);
/// // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
/// }
/// ```
#[inline]
pub fn cum_sizemap(&mut self) {
debug!("calculating the cumulative percentage of occurences for data point sizes...");
// calculate the percentage by sizes
// -> {11: 28.57142857142857, 14: 14.285714285714285, 15: 57.14285714285714}
let mut size_ranks = SizeRankMap::new();
for key in self.sizes.keys() {
size_ranks.insert(
*key,
(*self.sizes.get(key).unwrap() as f64 / self.size_total as f64) * 100.0,
);
}
// sort the ranks by percentages in decreasing order
// -> [(15, 57.14285714285714), (11, 28.57142857142857), (14, 14.285714285714285)]
let mut sizes = size_ranks.iter().collect::<Vec<_>>();
sizes.sort_by(|&(_, a), &(_, b)| b.partial_cmp(a).unwrap());
// calculate the cumulative sum of the size rankings
// -> [(15, 57.14285714285714), (11, 85.71428571428571), (14, 100)]
self.size_ranks = sizes
.iter()
.scan((0_u32, 0.00_f64), |state, &(&k, &v)| {
*state = (k, state.1 + &v);
Some(*state)
})
.collect::<Vec<(_, _)>>();
}
/// This function generates realistic test data based on the sample data that was analyzed.
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// profile.analyze("One");
/// profile.analyze("Two");
/// profile.analyze("Three");
/// profile.analyze("Four");
/// profile.analyze("Five");
///
/// profile.pre_generate();
///
/// print!("The test data {:?} was generated.", profile.generate());
/// }
/// ```
#[inline]
pub fn generate(&mut self) -> String {
    // Panics if the profile has no pattern ranks,
    // (i.e.: no data was analyzed or pre_generate() has not been called).

    // 1. get a random number
    // NOTE(review): presumably a percentage between 0 and 100 - confirm against the macro
    let s: f64 = random_percentage!();

    // 2. find the first pattern that falls within the percentage chance of occurring
    //    NOTE: The sizes (lengths) don't need to be checked, since the patterns are
    //          already ranked by percent chance of occurring and include the full
    //          length of the entity analyzed.
    //    The running total is a sum of floats and can end slightly below 100
    //    (e.g.: 99.99999999999997), so a random number above the last rank
    //    falls back to the last pattern instead of finding nothing.
    let ranks = &self.pattern_ranks;
    let pattern = ranks
        .iter()
        .find(|rank| rank.1 >= s)
        .or_else(|| ranks.last())
        .map(|rank| rank.0.clone())
        .expect("The profile has no pattern ranks. Analyze data and call pre_generate() before generate().");

    // lastly, generate the test data using facts that adhere to the pattern
    self.generate_from_pattern(pattern)
}
/// This function generates realistic test data based on the sample data that was analyzed.
///
/// # Arguments
///
/// * `pattern: String` - The pattern to reference when generating the test data.</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// profile.analyze("01/13/2017");
/// profile.analyze("11/24/2017");
/// profile.analyze("08/05/2017");
///
/// profile.pre_generate();
///
/// let generated = profile.generate_from_pattern("##p##p####".to_string());
///
/// assert_eq!(generated.len(), 10);
/// }
/// ```
#[inline]
pub fn generate_from_pattern(&self, pattern: String) -> String {
let pattern_chars = pattern.chars().collect::<Vec<char>>();
let mut generated = String::new();
let prev_char = ' ';
// iterate through the chars in the pattern string
for (idx, ch) in pattern_chars.iter().enumerate() {
match crossbeam::scope(|scope| {
let c = ch;
let starts = if idx == 0 { 1 } else { 0 };
let ends = if idx == pattern_chars.len() - 1 { 1 } else { 0 };
let mut fact_options = vec![];
let prior_char = prev_char;
// iterate through the processors (vec) that hold the lists (vec) of facts
for v in &self.facts {
let selected_facts = scope.spawn(move |_| {
let mut facts = vec![];
// iterate through the list of facts
for value in v {
if value.starts_with == starts
&& value.ends_with == ends
&& value.pattern_placeholder == *c
&& value.index_offset == idx as u32
{
facts.push(value.key);
// if the value.key's prior char matches the prior generated char, then weight the value.key
// to increase the chance of it being used when generated
if value.prior_key.unwrap_or(' ') == prior_char {
facts.push(value.key);
facts.push(value.key);
}
// if the value.key's index_offset matches the current index, then weight the value.key
// to increase the chance of it being used when generated
if value.index_offset == idx as u32 {
facts.push(value.key);
facts.push(value.key);
}
}
}
facts
});
//append the selected_facts to the fact_options
//fact_options.extend_from_slice(&selected_facts.join());
match selected_facts.join() {
Ok(sf) => fact_options.extend_from_slice(&sf),
Err(err) => {
error!("{:?}", err);
panic!("{:?}", err);
}
}
}
//select a fact to use as the generated char
let rnd_start = 0;
let rnd_end = fact_options.len() - 1;
if rnd_start >= rnd_end {
//generated.push(fact_options[0 as usize]);
fact_options[0_usize]
} else {
let x: u32 = random_between!(rnd_start, rnd_end);
//prev_char = fact_options[x as usize];
//generated.push(prev_char);
fact_options[x as usize]
}
}) {
Ok(c) => generated.push(c),
Err(err) => {
error!("{:?}", err);
panic!("{:?}", err);
}
}
}
generated
}
/// This function learns by measuring how realistic the test data it generates is compared to the sample data that was provided.
///
/// # Arguments
///
/// * `control_list: Vec<String>` - The list of strings to compare against. This would be the real data from the data sample.</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profil = Profile::new();
/// let sample_data = vec!("Smith, John".to_string(),"Doe, John".to_string(),"Dale, Danny".to_string(),"Rickets, Ronney".to_string());
///
/// for sample in sample_data.iter().clone() {
/// profil.analyze(&sample);
/// }
///
/// // in order to learn the profile must be prepared with pre_generate()
/// // so it can generate data to learn from
/// profil.pre_generate();
///
/// let learning = profil.learn_from_entity(sample_data).unwrap();
///
/// assert_eq!(learning, true);
/// }
/// ```
pub fn learn_from_entity(&mut self, control_list: Vec<String>) -> Result<bool, String> {
    // 10 rounds: generate a value, score it against every control value and
    // feed it back into the profile when the average score is high enough
    for _round in 0..10 {
        let experiment = self.generate();

        // the percent similarity of the experiment to each of the controls
        let percent_similarity: Vec<f64> = control_list
            .iter()
            .map(|control| {
                debug!("Comparing {} with {} ...", &control, &experiment);
                self.realistic_test(control, &experiment)
            })
            .collect();

        // the average similarity (NaN for an empty control list, which never passes the check below)
        let percent = percent_similarity.iter().sum::<f64>() / percent_similarity.len() as f64;
        debug!("Percent similarity is {} ...", &percent);

        if percent >= 80_f64 {
            self.analyze(&experiment);
        }
    }

    Ok(true)
}
/// This function calculates the levenshtein distance between 2 strings.
/// See: https://crates.io/crates/levenshtein
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// assert_eq!(profile.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
/// }
/// ```
pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
// Thin wrapper: the distance is calculated by the `levenshtein_distance!` macro
// (presumably defined in this crate's `macros` module - the macro is not visible here).
// `self` is not used by this function.
// https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
levenshtein_distance!(control, experiment)
}
/// This function calculates the percent difference between 2 strings.
///
/// # Arguments
///
/// * `control: &str` - The string to compare against. This would be the real data from the data sample.</br>
/// * `experiment: &str` - The string to compare. This would be the generated data for which you want to find the percent difference.</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// assert_eq!(profile.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
/// }
/// ```
#[inline]
pub fn realistic_test(&mut self, control: &str, experiment: &str) -> f64 {
// Thin wrapper: the percentage is calculated by the `realistic_test!` macro
// (presumably defined in this crate's `macros` module - the macro is not visible here).
// `self` is not used by this function.
realistic_test!(control, experiment)
}
/// This function is called from within the implemented structure and returns a list of processors (Vec) with empty lists (Vec) for their Facts.
/// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
///
/// # Arguments
///
/// * `p: u8` - A number that sets the number of processors to start up to manage the Facts.</br>
/// Increasing the number of processors will speed up the generator by distributing the workload.
/// The recommended number of processors is 1 per 10K data points (e.g.: profiling 20K names should be handled by 2 processors)</br>
/// NOTE: The default number of processors is 4.
///
#[inline]
fn new_facts(p: u8) -> Vec<Vec<Fact>> {
    // one empty list of Facts for each of the `p` processors
    (0..p).map(|_| Vec::new()).collect()
}
/// This function prepares the size and pattern cumulative percentages that are used to generate data
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
/// profile.analyze("One");
/// profile.analyze("Two");
/// profile.analyze("Three");
/// profile.analyze("Four");
/// profile.analyze("Five");
/// profile.analyze("Six");
///
/// profile.pre_generate();
///
/// print!("The size ranks are {:?}", profile.size_ranks);
/// // The size ranks are [(3, 50), (4, 83.33333333333333), (5, 100)]
/// }
/// ```
pub fn pre_generate(&mut self) {
// Must be called after the data has been analyzed and before generate() is called,
// because generate() picks its pattern from the ranks that are calculated here.
info!("Preparing the profile for data generation...");
// rebuilds `size_ranks`
self.cum_sizemap();
// rebuilds `pattern_percentages` and `pattern_ranks`
self.cum_patternmap();
info!("Profile: preparing generator...");
}
/// This function resets the patterns that the Profile has analyzed.
/// Call this method whenever you wish to "clear" the Profile
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// let mut profile = Profile::new();
///
/// profile.analyze("One");
/// profile.analyze("Two");
/// profile.analyze("Three");
///
/// let x = profile.patterns.len();
///
/// profile.reset_analyze();
///
/// profile.analyze("Four");
/// profile.analyze("Five");
/// profile.analyze("Six");
/// profile.analyze("Seven");
/// profile.analyze("Eight");
/// profile.analyze("Nine");
/// profile.analyze("Ten");
///
/// let y = profile.patterns.len();
///
/// assert_eq!(x, 3);
/// assert_eq!(y, 5);
/// }
/// ```
pub fn reset_analyze(&mut self) {
    info!("Resetting the profile ...");
    // Only the counted patterns are emptied.
    // NOTE(review): the sizes, the facts and the values derived from the patterns
    // (pattern_total, pattern_keys, pattern_vals, ranks) are left untouched until the
    // next analyze() / pre_generate() - confirm this is intended.
    self.patterns.clear();
    info!("Profile: patterns have been reset ...");
}
/// This function saves (exports) the Profile to a JSON file.
/// This is useful when you wish to reuse the algorithm to generate more test data later.
///
/// # Arguments
///
/// * `path: &str` - The full path of the export file, excluding the file extension (e.g.: "./test/data/custom-names").</br>
///
/// # Errors
/// If this function encounters any form of I/O or other error, an error variant will be returned.
/// Otherwise, the function returns Ok(true).</br>
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// // analyze the dataset
/// let mut profile = Profile::new();
/// profile.analyze("Smith, John");
/// profile.analyze("O'Brian, Henny");
/// profile.analyze("Dale, Danny");
/// profile.analyze("Rickets, Ronney");
///
/// profile.pre_generate();
///
/// assert_eq!(profile.save("./tests/samples/sample-00-profile").unwrap(), true);
/// }
/// ```
pub fn save(&mut self, path: &'static str) -> Result<bool, io::Error> {
let dsp_json = serde_json::to_string(&self).unwrap();
// Create the archive file
let mut file = match File::create(format!("{}.json", &path)) {
Err(e) => {
error!("Could not create file {:?}", &path.to_string());
return Err(e);
}
Ok(f) => {
info!("Successfully exported to {:?}", &path.to_string());
f
}
};
// Write the json string to file, returns io::Result<()>
match file.write_all(dsp_json.as_bytes()) {
Err(e) => {
error!("Could not write to file {}", &path.to_string());
return Err(e);
}
Ok(_) => {
info!("Successfully exported to {}", &path.to_string());
}
};
Ok(true)
}
/// This function converts the Profile to a serialized JSON string.
///
/// # Example
///
/// ```rust
/// extern crate test_data_generation;
///
/// use test_data_generation::Profile;
///
/// fn main() {
/// // analyze the dataset
/// let mut data_profile = Profile::new();
///
/// // analyze the dataset
/// data_profile.analyze("OK");
///
/// println!("{}", data_profile.serialize());
/// // {"patterns":{"VC":1},"pattern_total":1,"pattern_keys":["VC"],"pattern_vals":[1],"pattern_percentages":[],"pattern_ranks":[],"sizes":{"2":1},"size_total":1,"size_ranks":[],"processors":4,"facts":[[{"key":"O","prior_key":null,"next_key":"K","pattern_placeholder":"V","starts_with":1,"ends_with":0,"index_offset":0}],[{"key":"K","prior_key":"O","next_key":null,"pattern_placeholder":"C","starts_with":0,"ends_with":1,"index_offset":1}],[],[]]}
/// }
/// ```
pub fn serialize(&mut self) -> String {
    // panics (unwrap) in case the Profile cannot be serialized
    let json = serde_json::to_string(&*self);
    json.unwrap()
}
}
#[macro_use]
pub mod macros;
pub mod configs;
pub mod data_sample_parser;
pub mod engine;
pub mod shared;
// Unit Tests
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn apply_facts() {
    // the pattern and facts of an analyzed word must be accepted by a new profile
    let analysis = PatternDefinition::new().analyze("Word");
    let applied = Profile::new().apply_facts(analysis.0, analysis.1);

    assert_eq!(applied.unwrap(), 1);
}
#[test]
fn levenshtein_test() {
    let control = "kitten".to_string();
    let experiment = "sitting".to_string();

    let distance = Profile::new().levenshtein_distance(&control, &experiment);

    assert_eq!(distance, 3_usize);
}
#[test]
fn realistic_data_test() {
    let similarity = Profile::new().realistic_test("kitten", "sitting");

    assert_eq!(similarity, 76.92307692307692_f64);
}
#[test]
fn learn_from_entity() {
    let sample_data: Vec<String> = ["Smith, John", "Doe, John", "Dale, Danny", "Rickets, Ronney"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    // the profile has to know the samples and be prepared before it can learn
    let mut profil = Profile::new();
    sample_data.iter().for_each(|sample| profil.analyze(sample));
    profil.pre_generate();

    assert!(profil.learn_from_entity(sample_data).unwrap());
}
#[test]
fn logging_test() {
    // passes as long as resetting a new profile (which writes to the log) doesn't panic
    Profile::new().reset_analyze();
}
#[test]
fn new_profile_with_id() {
    let mut profile = Profile::new_with_id(String::from("12345"));
    profile.pre_generate();

    // the identifier must survive the preparation of the profile
    assert_eq!(profile.id, Some(String::from("12345")));
}
#[test]
fn new_profile_from_file() {
    let mut profile = Profile::from_file("./tests/samples/sample-00-profile");
    profile.pre_generate();

    // a restored profile must be able to generate data
    assert!(!profile.generate().is_empty());
}
#[test]
#[should_panic]
fn new_profile_from_file_bad_data() {
    // restoring a profile from this file is expected to panic
    let mut profile = Profile::from_file("./tests/samples/not-readable");
    profile.pre_generate();

    assert!(!profile.generate().is_empty());
}
#[test]
#[should_panic(expected = "Could not open file \"./tests/samples/bad-path\"")]
fn new_profile_from_file_bad_path() {
let mut profile = Profile::from_file("./tests/samples/bad-path");
profile.pre_generate();
assert!(profile.generate().len() > 0);
}
#[test]
fn new_profile_from_serialized() {
let serialized = "{\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}";
let mut profile = Profile::from_serialized(&serialized);
profile.pre_generate();
assert_eq!(profile.generate(), "OK");
}
#[test]
fn new_profile_new_with() {
let profile = Profile::new_with_processors(10);
assert_eq!(profile.processors, 10);
}
#[test]
// ensure Profile is analyzing all the sample data points
fn profile_analyze() {
let mut profil = Profile::new();
profil.analyze("Smith, John");
profil.analyze("O'Brian, Henny");
profil.analyze("Dale, Danny");
profil.analyze("Rickets, Ronney");
assert_eq!(profil.patterns.len(), 4);
}
#[test]
// ensure Profile is able to find the facts that relate to a pattern
// NOTE: Dates need work! e.g.: 00/15/0027
fn profile_generate_from_pattern_date() {
let mut profil = Profile::new();
profil.analyze("01/13/2017");
profil.analyze("11/24/2017");
profil.analyze("08/05/2017");
profil.pre_generate();
let generated = profil.generate_from_pattern("##p##p####".to_string());
assert_eq!(10, generated.len());
}
#[test]
// ensure Profile is able to find the facts that relate to a pattern
fn profile_generate_from_pattern_string() {
let mut profil = Profile::new();
profil.analyze("First");
profil.analyze("Next");
profil.analyze("Last");
profil.pre_generate();
let generated = profil.generate_from_pattern("Cvcc".to_string());
assert_eq!(4, generated.len());
}
#[test]
// ensure Profile is generating correct test data
fn profile_generate() {
let mut profil = Profile::new();
profil.analyze("Smith, John");
profil.analyze("O'Brian, Henny");
profil.analyze("Dale, Danny");
profil.analyze("Rickets, Ronnae");
profil.analyze("Richard, Richie");
profil.analyze("Roberts, Blake");
profil.analyze("Conways, Sephen");
profil.pre_generate();
assert!(profil.generate().len() > 10);
}
#[test]
// issue #31
// ensure Profile doesn't generate a name with a backslash preceding an apostrophe
fn profile_generate_with_apostrophe() {
let mut profil = Profile::new();
profil.analyze("O'Brien");
profil.pre_generate();
let generated = profil.generate();
assert_eq!(generated, "O'Brien");
}
#[test]
// ensure Profile is providing the correct pattern ranks after analyzing the sample data
fn profile_pregenerate_patterns() {
let mut profil = Profile::new();
profil.analyze("Smith, John");
profil.analyze("O'Brian, Henny");
profil.analyze("Dale, Danny");
profil.analyze("Rickets, Ronnae");
profil.analyze("Richard, Richie");
profil.analyze("Roberts, Blake");
profil.analyze("Conways, Sephen");
profil.pre_generate();
let test = [
("CvccvccpSCvccvv".to_string(), 28.57142857142857 as f64),
("CcvccpSCvcc".to_string(), 42.857142857142854 as f64),
("CvccvccpSCvccvc".to_string(), 57.14285714285714 as f64),
("CvcvcccpSCcvcv".to_string(), 71.42857142857142 as f64),
("CvcvpSCvccc".to_string(), 85.7142857142857 as f64),
("V@CcvvcpSCvccc".to_string(), 99.99999999999997 as f64),
];
assert_eq!(profil.pattern_ranks, test);
}
#[test]
// ensure Profile is providing the correct pattern ranks after analyzing the sample data
fn profile_pregenerate_sizes() {
let mut profil = Profile::new();
profil.analyze("Smith, Johny"); //12
profil.analyze("O'Brian, Hen"); //12
profil.analyze("Dale, Danny"); //11
profil.analyze("O'Henry, Al"); //11
profil.analyze("Rickets, Ro"); //11
profil.analyze("Mr. Wilbers"); //11
profil.analyze("Po, Al"); //6
profil.pre_generate();
let test = [
(11, 57.14285714285714),
(12, 85.71428571428571),
(6, 100 as f64),
];
assert_eq!(profil.size_ranks, test);
}
#[test]
fn save_profile() {
let mut profile = Profile::new();
profile.analyze("Smith, John");
profile.analyze("O'Brian, Henny");
profile.analyze("Dale, Danny");
profile.analyze("Rickets, Ronney");
profile.pre_generate();
assert_eq!(
profile.save("./tests/samples/sample-00-profile").unwrap(),
true
);
}
#[test]
// ensure a Profile can be exported (to be archived) as JSON
fn serialize() {
let mut profil = Profile::new();
// analyze the dataset
profil.analyze("OK");
let serialized = profil.serialize();
assert_eq!(serialized, "{\"id\":null,\"patterns\":{\"VC\":1},\"pattern_total\":1,\"pattern_keys\":[\"VC\"],\"pattern_vals\":[1],\"pattern_percentages\":[],\"pattern_ranks\":[],\"sizes\":{\"2\":1},\"size_total\":1,\"size_ranks\":[],\"processors\":4,\"facts\":[[{\"key\":\"O\",\"prior_key\":null,\"next_key\":\"K\",\"pattern_placeholder\":\"V\",\"starts_with\":1,\"ends_with\":0,\"index_offset\":0}],[{\"key\":\"K\",\"prior_key\":\"O\",\"next_key\":null,\"pattern_placeholder\":\"C\",\"starts_with\":0,\"ends_with\":1,\"index_offset\":1}],[],[]]}");
}
}