test_data_generation/
data_sample_parser.rs

1//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it,
2//! so that test data can be generated based on profiles.
3//!
4//! # Examples
5//!
6//!
7//! Generate some demo test data ...
8//!
9//! ```
10//! extern crate test_data_generation;
11//!
12//! use test_data_generation::data_sample_parser::DataSampleParser;
13//!
14//! fn main() {
//!     // initialize a new DataSampleParser
16//!		let dsp = DataSampleParser::new();
17//!
18//!		// generate some test data using the demo functions
19//!		println!("generate date:{}", dsp.demo_date());
20//!		println!("generate person:{}", dsp.demo_person_name());
21//! }
22//! ```
23//!
24//! Save the algorithm ...
25//!
26//! Archive (export) the data sample parser object so that you can reuse the algorithm to generate test data at a later time.
27//! This enables you to persist the algorithm without having to store the actual data sample that was used to create the algorithm -
28//! Which is important if you used 'real' data in your sample data.
29//!
30//! ```
31//! extern crate test_data_generation;
32//!
33//! use test_data_generation::data_sample_parser::DataSampleParser;
34//!
35//! fn main() {
36//! 	// analyze the dataset
37//!		let mut dsp =  DataSampleParser::new();
38//!
39//!     assert_eq!(dsp.save(&String::from("./tests/samples/empty-dsp")).unwrap(), true);
40//! }
41//! ```
42//!
43//! Load an algorithm ...
44//!
45//! Create a data sample parser from a previously saved (exported) archive file so you can generate test data based on the algorithm.</br>
//! *NOTE:* In this example, there was only one data point in the data sample that was analyzed (the word 'OK'). This was intentional
//! so the algorithm would be guaranteed to generate that same word. This was done to ensure the assert_eq! returns true.
48//!
49//! ```
50//! extern crate test_data_generation;
51//!
52//! use test_data_generation::data_sample_parser::DataSampleParser;
53//!
54//! fn main() {
55//!		let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
56//!
57//!		assert_eq!(dsp.generate_record()[0], "OK".to_string());
58//! }
59//! ```
60//!
61//! You can also generate a new csv file based on the data sample provided.
62//!
63//! ```
64//! extern crate test_data_generation;
65//!
66//! use test_data_generation::data_sample_parser::DataSampleParser;
67//!
68//! fn main() {
69//!     let mut dsp =  DataSampleParser::new();
70//!
71//!     // Using the default delimiter (comma)
72//!    	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
73//!    	dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
74//! }
75//! ```
76//!
77
78// use std::collections::BTreeMap;
79use crate::configs::Configs;
80use crate::engine::{Engine, EngineContainer};
81use crate::shared::CsvManipulator;
82use crate::Profile;
83use csv;
84use indexmap::IndexMap;
85use std::fs::File;
86use std::io;
87use std::io::prelude::*;
88use std::io::Write;
89use std::result::Result;
90//use csv::StringRecord;
91use csv::WriterBuilder;
92use serde_json;
93use serde_json::Value;
94use std::error::Error;
95
96use std::sync::mpsc;
97use std::sync::mpsc::{Receiver, Sender};
98use std::thread;
99
100const DELIMITER: u8 = b',';
101
102type ProfilesMap = IndexMap<String, Profile>;
103
104#[derive(Serialize, Deserialize, Debug)]
105/// Represents the Parser for sample data to be used
106pub struct DataSampleParser {
107    /// indicates if there were issues parsing and anlyzing the data sample
108    pub issues: bool,
109    /// Configs object that define the configuration settings
110    cfg: Option<Configs>,
111    /// List of Profiles objects identified by a unique profile name LinkedHashMap<String, Profile>
112    #[serde(with = "indexmap::serde_seq")]
113    profiles: ProfilesMap,
114}
115
116impl CsvManipulator for DataSampleParser {}
117impl Engine for DataSampleParser {}
118
119impl DataSampleParser {
120    /// Constructs a new DataSampleParser
121    ///
122    /// #Example
123    ///
124    /// ```
125    /// extern crate test_data_generation;
126    ///
127    /// use test_data_generation::data_sample_parser::DataSampleParser;
128    ///
129    /// fn main() {
130    ///		// initalize a new DataSampelParser
131    ///		let dsp = DataSampleParser::new();
132    /// }
133    /// ```
134    pub fn new() -> DataSampleParser {
135        DataSampleParser {
136            issues: false,
137            cfg: None,
138            profiles: ProfilesMap::new(),
139        }
140    }
141
142    /// Constructs a new DataSampleParser
143    ///
144    /// # Arguments
145    ///
    /// * `path: &String` - The full path name (including the file name and extension) to the configuration file.</br>
147    ///
148    /// #Example
149    ///
150    /// ```
151    /// extern crate test_data_generation;
152    ///
153    /// use test_data_generation::data_sample_parser::DataSampleParser;
154    ///
155    /// fn main() {
156    ///		// initalize a new DataSampelParser
157    ///	    // param: the path to the configuration  file
158    ///		let dsp = DataSampleParser::new_with(&String::from("./config/tdg.yaml"));
159    /// }
160    /// ```
161    pub fn new_with(path: &String) -> DataSampleParser {
162        DataSampleParser {
163            issues: false,
164            cfg: Some(Configs::new(path)),
165            profiles: ProfilesMap::new(),
166        }
167    }
168
169    /// Constructs a new DataSampleParser from an exported JSON file. This is used when restoring from "archive"
170    ///
171    /// # Arguments
172    ///
173    /// * `path: &String` - The full path name of the json formatted Data Sample Parser archive file.</br>
174    ///
175    /// #Example
176    ///
177    /// ```
178    /// extern crate test_data_generation;
179    ///
180    /// use test_data_generation::data_sample_parser::DataSampleParser;
181    ///
182    /// fn main() {
183    ///		let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
184    ///
185    ///		assert_eq!(dsp.generate_record()[0], "OK".to_string());
186    /// }
187    /// ```
188    pub fn from_file(path: &String) -> DataSampleParser {
189        // open the archive file
190        let mut file = match File::open(format!("{}.json", &path)) {
191            Err(_e) => {
192                error!("Could not open file {:?}", &path.to_string());
193                panic!("Could not open file {:?}", &path.to_string());
194            }
195            Ok(f) => {
196                info!("Successfully opened file {:?}", &path.to_string());
197                f
198            }
199        };
200
201        //read the archive file
202        let mut serialized = String::new();
203        match file.read_to_string(&mut serialized) {
204            Err(e) => {
205                error!(
206                    "Could not read file {:?} because of {:?}",
207                    &path.to_string(),
208                    e.to_string()
209                );
210                panic!(
211                    "Could not read file {:?} because of {:?}",
212                    &path.to_string(),
213                    e.to_string()
214                );
215            }
216            Ok(s) => {
217                info!("Successfully read file {:?}", &path.to_string());
218                s
219            }
220        };
221
222        // Support backwards compatibility for DSP saved using prior versions
223        let dsp: Value = serde_json::from_str(&serialized).unwrap();
224        let prfils = dsp.get("profiles").unwrap();
225
226        match prfils.is_array() {
227            true => {
228                debug!("Version 0.3.0 detected. Using latest version");
229                return serde_json::from_str(&serialized).unwrap();
230            }
231            false => {
232                info!("Prior version 0.2.1 detected. Trying to upgrade to latest version");
233
234                return Self::upgrade_to_latest_version(serialized);
235            }
236        }
237    }
238
239    fn upgrade_to_latest_version(serialized: String) -> DataSampleParser {
240        let dsp: Value = serde_json::from_str(&serialized).unwrap();
241        let prfils = dsp.get("profiles").unwrap();
242        let mut pm: ProfilesMap = ProfilesMap::new();
243        let issues = dsp.get("issues").unwrap().as_bool().unwrap();
244
245        for prf in prfils.as_object().iter() {
246            for attr in prf.keys() {
247                let id = prf
248                    .get(attr)
249                    .unwrap()
250                    .as_object()
251                    .unwrap()
252                    .get("id")
253                    .unwrap()
254                    .as_str()
255                    .unwrap()
256                    .to_string();
257                let serl = &serde_json::to_string(prf.get(attr).unwrap()).unwrap();
258                println!("{:?} : {:?}", id, serl);
259                pm.insert(id, Profile::from_serialized(serl));
260            }
261        }
262
263        let mut rtn = match dsp.get("cfg").unwrap() {
264            serde_json::Value::Null => DataSampleParser::new(),
265            _ => DataSampleParser::new_with(
266                &dsp.get("cfg")
267                    .unwrap()
268                    .as_object()
269                    .unwrap()
270                    .get("file")
271                    .unwrap()
272                    .as_str()
273                    .unwrap()
274                    .to_string(),
275            ),
276        };
277
278        rtn.issues = issues;
279        rtn.profiles = pm;
280        return rtn;
281    }
282
283    #[inline]
284    fn analyze_columns(&mut self, profile_keys: Vec<String>, columns: Vec<Vec<String>>) {
285        let col_cnt = columns.len();
286        let (tx, rx): (
287            Sender<Result<Profile, String>>,
288            Receiver<Result<Profile, String>>,
289        ) = mpsc::channel();
290        let mut jobs = Vec::new();
291
292        //iterate through all the columns
293        for (idx, column) in columns.iter().enumerate() {
294            let thread_tx = tx.clone();
295            let container = EngineContainer {
296                profile: self.profiles.get(&profile_keys[idx]).unwrap().clone(),
297                entities: column.to_vec(),
298            };
299
300            let job = thread::spawn(move || {
301                let result = Self::profile_entities_with_container(container);
302                thread_tx.send(result).unwrap();
303            });
304
305            jobs.push(job);
306        }
307
308        let mut results = Vec::with_capacity(col_cnt);
309        for _ in 0..col_cnt {
310            results.push(rx.recv());
311        }
312
313        for job in jobs {
314            job.join().expect("Error: Could not run the job");
315        }
316
317        for result in results {
318            match result {
319                Ok(msg) => {
320                    //received from sender
321                    match msg {
322                        Ok(p) => {
323                            let id = p.clone().id.unwrap();
324                            debug!("Profile {} has finished analyzing the entities.", id);
325                            self.profiles.insert(id, p);
326                        }
327                        Err(e) => {
328                            error!(
329                                "Profile wasn't able to analyzing the entities. Error: {}",
330                                e
331                            );
332                        }
333                    }
334                }
335                Err(e) => {
336                    // could not receive from sender
337                    error!("Receiver wasn't able to receive message from sender which was analyzing entities for the profile. Error: {}", e);
338                    panic!("Receiver wasn't able to receive message from sender which was analyzing entities for the profile. Error: {}", e);
339                }
340            }
341        }
342        // Multi-Threading END
343    }
344
345    /// This function analyzes sample data that is a csv formatted string and returns a boolean if successful.
346    /// _NOTE:_ The csv properties are as follows:
347    ///       + headers are included as first line
348    ///       + double quote wrap text
349    ///       + double quote escapes is enabled
350    ///       + delimiter is a comma
351    ///
352    ///
353    /// # Arguments
354    ///
355    /// * `data: &String` - The textual content of a csv formatted sample data file.</br>
356    /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
357    ///
358    /// # Example
359    ///
360    /// ```
361    /// extern crate test_data_generation;
362    ///
363    /// use test_data_generation::data_sample_parser::DataSampleParser;
364    ///
365    /// fn main() {
366    ///		// initalize a new DataSampelParser
367    ///		let mut dsp = DataSampleParser::new();
368    ///		let mut data = String::from("");
369    ///		data.push_str("\"firstname\",\"lastname\"\n");
370    ///		data.push_str("\"Aaron\",\"Aaberg\"\n");
371    ///		data.push_str("\"Aaron\",\"Aaby\"\n");
372    ///		data.push_str("\"Abbey\",\"Aadland\"\n");
373    ///		data.push_str("\"Abbie\",\"Aagaard\"\n");
374    ///		data.push_str("\"Abby\",\"Aakre\"");
375    ///
376    ///     // Use the default delimiter (comma)
377    /// 	assert_eq!(dsp.analyze_csv_data(&data, None).unwrap(),1);
378    /// }
379    /// ```
380    pub fn analyze_csv_data(
381        &mut self,
382        data: &String,
383        delimiter: Option<u8>,
384    ) -> Result<i32, String> {
385        debug!("Starting to analyzed the csv data {}", data);
386
387        let mut rdr = csv::ReaderBuilder::new()
388            .has_headers(true)
389            .quote(b'"')
390            .double_quote(true)
391            .delimiter(Self::else_default_delimiter(delimiter))
392            .from_reader(data.as_bytes());
393
394        //iterate through the headers
395        for headers in rdr.headers() {
396            for header in headers.iter() {
397                //add a Profile to the list of profiles to represent the field (indexed using the header label)
398                let p = Profile::new_with_id(header.to_string());
399                self.profiles.insert(header.to_string(), p);
400            }
401        }
402
403        //create a Vec from all the keys (headers) in the profiles list
404        let profile_keys: Vec<_> = self.profiles.keys().cloned().collect();
405
406        debug!("CSV headers: {:?}", profile_keys);
407
408        // Multi-Threading START
409        let columns = Self::read_as_columns(rdr);
410        //let col_cnt = columns.len();
411        let rec_cnt = columns[0].len();
412        self.analyze_columns(profile_keys, columns);
413
414        debug!("Successfully analyzed the csv data");
415        debug!(
416            "Analyzed {} records, {} fields",
417            rec_cnt,
418            self.profiles.len()
419        );
420
421        //prepare the profiles for data generation
422        self.profiles.iter_mut().for_each(|p| p.1.pre_generate());
423
424        Ok(1)
425    }
426
427    /// This function analyzes sample data that is a csv formatted file and returns a boolean if successful.
428    /// _NOTE:_ The csv properties are as follows:
429    ///       + headers are included as first line
430    ///       + double quote wrap text
431    ///       + double quote escapes is enabled
432    ///       + delimiter is a comma
433    ///
434    ///
435    /// # Arguments
436    ///
437    /// * `path: &String` - The full path name of the csv formatted sample data file.</br>
438    /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
439    ///
440    /// # Example
441    ///
442    /// ```
443    /// extern crate test_data_generation;
444    ///
445    /// use test_data_generation::data_sample_parser::DataSampleParser;
446    ///
447    /// fn main() {
448    ///		// initalize a new DataSampelParser
449    ///		let mut dsp = DataSampleParser::new();
450    ///
451    ///     // Use the default delimiter (comma)
452    /// 	assert_eq!(dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap(),1);
453    /// }
454    /// ```
455    pub fn analyze_csv_file(
456        &mut self,
457        path: &String,
458        delimiter: Option<u8>,
459    ) -> Result<i32, String> {
460        info!("Starting to analyzed the csv file {}", path);
461
462        let mut file = (File::open(path).map_err(|e| {
463            error!("csv file {} couldn't be opened!", path);
464            e.to_string()
465        }))?;
466
467        let mut data = String::new();
468        file.read_to_string(&mut data)
469            .map_err(|e| {
470                error!("csv file {} couldn't be read!", path);
471                e.to_string()
472            })
473            .unwrap();
474
475        self.analyze_csv_data(&data, delimiter)
476    }
477
478    /// This function generates date as strings using the a `demo` profile
479    ///
480    /// # Example
481    ///
482    /// ```
483    /// extern crate test_data_generation;
484    ///
485    /// use test_data_generation::data_sample_parser::DataSampleParser;
486    ///
487    /// fn main() {
488    ///		// initalize a new DataSampelParser
489    ///		let dsp = DataSampleParser::new();
490    ///
491    ///		// generate some test data using the demo functions
492    ///		println!("generate date:{}", dsp.demo_date());
493    /// }
494    /// ```
495    pub fn demo_date(&self) -> String {
496        let mut profil = Profile::new();
497
498        profil.analyze("01/04/2017");
499        profil.analyze("02/09/2017");
500        profil.analyze("03/13/2017");
501        profil.analyze("04/17/2017");
502        profil.analyze("05/22/2017");
503        profil.analyze("07/26/2017");
504        profil.analyze("08/30/2017");
505        profil.analyze("09/07/2017");
506        profil.analyze("10/11/2017");
507        profil.analyze("11/15/2017");
508        profil.analyze("12/21/2017");
509        profil.analyze("01/14/2016");
510        profil.analyze("02/19/2016");
511        profil.analyze("03/23/2016");
512        profil.analyze("04/27/2016");
513        profil.analyze("05/02/2016");
514        profil.analyze("07/16/2015");
515        profil.analyze("08/20/2015");
516        profil.analyze("09/17/2015");
517        profil.analyze("10/01/2014");
518        profil.analyze("11/25/2014");
519        profil.analyze("12/31/2018");
520
521        profil.pre_generate();
522        //profil.apply_facts("##p##p####".to_string())
523        profil.generate()
524    }
525
526    /// This function generates people's names as strings using the a `demo` profile
527    ///
528    /// # Example
529    ///
530    /// ```
531    /// extern crate test_data_generation;
532    ///
533    /// use test_data_generation::data_sample_parser::DataSampleParser;
534    ///
535    /// fn main() {
536    ///		// initalize a new DataSampelParser
537    ///		let dsp = DataSampleParser::new();
538    ///
539    ///		// generate some test data using the demo functions
540    ///		println!("generate date:{}", dsp.demo_person_name());
541    /// }
542    pub fn demo_person_name(&self) -> String {
543        let mut profil = Profile::new();
544
545        profil.analyze("Smith, John");
546        profil.analyze("O'Brien, Henny");
547        profil.analyze("Dale, Danny");
548        profil.analyze("Rickets, Ronnae");
549        profil.analyze("Richard, Richie");
550        profil.analyze("Roberts, Blake");
551        profil.analyze("Conways, Sephen");
552
553        profil.pre_generate();
554        profil.generate()
555    }
556
557    fn else_default_delimiter(delimiter: Option<u8>) -> u8 {
558        match delimiter {
559            Some(d) => {
560                return d;
561            }
562            None => {
563                return DELIMITER;
564            }
565        }
566    }
567
568    /// This function returns a vector of header names
569    ///
570    /// # Example
571    ///
572    /// ```
573    /// extern crate test_data_generation;
574    ///
575    /// use test_data_generation::data_sample_parser::DataSampleParser;
576    ///
577    /// fn main() {
578    ///		// initalize a new DataSampelParser
579    ///		let mut dsp = DataSampleParser::new();
580    ///
581    /// 	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
582    ///     let headers = dsp.extract_headers();
583    ///
584    ///		assert_eq!(headers.len(), 2);
585    /// }
586    pub fn extract_headers(&mut self) -> Vec<String> {
587        let mut headers = vec![];
588
589        for profile in self.profiles.iter_mut() {
590            headers.push(profile.0.to_string());
591        }
592
593        headers
594    }
595
596    /// This function generates test data for the specified field name.
597    ///
598    /// # Arguments
599    ///
600    /// * `field: String` - The name of the field (e.g.: firstname) the represents the profile to use when generating the test data.</br>
601    ///
602    /// # Example
603    ///
604    /// ```
605    /// extern crate test_data_generation;
606    ///
607    /// use test_data_generation::data_sample_parser::DataSampleParser;
608    ///
609    /// fn main() {
610    ///		// initalize a new DataSampelParser
611    ///		let mut dsp = DataSampleParser::new();
612    ///
613    /// 	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
614    ///     println!("Generated data for first name {}",dsp.generate_by_field_name("firstname".to_string()));
615    /// }
616    /// ```
617    pub fn generate_by_field_name(&mut self, field: String) -> String {
618        self.profiles
619            .get_mut(&field)
620            .unwrap()
621            .generate()
622            .to_string()
623    }
624
625    /// This function Vec of generates test data fields.
626    ///
627    /// # Example
628    ///
629    /// ```
630    /// extern crate test_data_generation;
631    ///
632    /// use test_data_generation::data_sample_parser::DataSampleParser;
633    ///
634    /// fn main() {
635    ///		// initalize a new DataSampelParser
636    ///		let mut dsp = DataSampleParser::new();
637    ///
638    /// 	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
639    ///     println!("Generated data record: {:?}",dsp.generate_record());
640    /// }
641    /// ```
642    pub fn generate_record(&mut self) -> Vec<String> {
643        let mut record = Vec::new();
644
645        for profile in self.profiles.iter_mut() {
646            record.push(profile.1.generate().to_string());
647        }
648
649        record
650    }
651
652    /// This function creates a csv file of generated test data.
653    /// Prior to calling this funciton, you need to call the analyze_csv_file() function.
654    /// _NOTE:_ The csv properties are as follows:
655    ///       + headers are included as first line
656    ///       + double quotes wrap text
657    ///       + double quote escapes is enabled
658    ///       + delimiter is a comma
659    ///
660    ///
661    /// # Arguments
662    ///
663    /// * `row_count: u32` - The number of rows to generate.</br>
664    /// * `path: &String` - The full path name where to save the csv file.</br>
665    /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
666    ///
667    /// # Example
668    ///
669    /// ```
670    /// extern crate test_data_generation;
671    ///
672    /// use test_data_generation::data_sample_parser::DataSampleParser;
673    ///
674    /// fn main() {
675    ///		// initalize a new DataSampelParser
676    ///		let mut dsp = DataSampleParser::new();
677    ///
678    /// 	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
679    ///     dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
680    /// }
681    /// ```
682    pub fn generate_csv(
683        &mut self,
684        row_count: u32,
685        path: &String,
686        delimiter: Option<u8>,
687    ) -> Result<(), Box<dyn Error>> {
688        info!("generating csv file {}", path);
689
690        let mut wtr = (WriterBuilder::new()
691            .has_headers(true)
692            .quote(b'"')
693            .double_quote(true)
694            .delimiter(Self::else_default_delimiter(delimiter))
695            .from_path(path)
696            .map_err(|e| {
697                error!("csv file {} couldn't be created!", path);
698                e.to_string()
699            }))?;
700
701        let headers = self.extract_headers();
702        wtr.write_record(&headers)?;
703
704        for _r in 0..row_count {
705            let mut record = Vec::new();
706
707            for profile in self.profiles.iter_mut() {
708                record.push(profile.1.generate());
709            }
710
711            wtr.write_record(&record)?;
712        }
713
714        wtr.flush()?;
715
716        Ok(())
717    }
718
719    /// This function calculates the levenshtein distance between 2 strings.
720    /// See: https://crates.io/crates/levenshtein
721    ///
722    /// # Arguments
723    ///
724    /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
725    /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
726    ///
727    /// #Example
728    ///
729    /// ```
730    /// extern crate test_data_generation;
731    ///
732    /// use test_data_generation::data_sample_parser::DataSampleParser;
733    ///
734    /// fn main() {
735    /// 	// analyze the dataset
736    ///		let mut dsp =  DataSampleParser::new();
737    ///
738    ///     assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
739    /// }
740    ///
741    pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
742        // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
743        levenshtein_distance!(control, experiment)
744    }
745
746    /// This function calculates the percent difference between 2 strings.
747    ///
748    /// # Arguments
749    ///
750    /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
751    /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the percent difference.</br>
752    ///
753    /// #Example
754    ///
755    /// ```
756    /// extern crate test_data_generation;
757    ///
758    /// use test_data_generation::data_sample_parser::DataSampleParser;
759    ///
760    /// fn main() {
761    /// 	// analyze the dataset
762    ///		let mut dsp =  DataSampleParser::new();
763    ///
764    ///     assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
765    /// }
766    ///
767    pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 {
768        //https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html
769        //http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
770        // pearson's chi square test
771        // cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
772        realistic_test!(control, experiment)
773    }
774
775    /// This function returns a boolean that indicates if the data sample parsing had issues
776    ///
777    /// # Example
778    ///
779    /// ```
780    /// extern crate test_data_generation;
781    ///
782    /// use test_data_generation::data_sample_parser::DataSampleParser;
783    ///
784    /// fn main() {
785    ///		// initalize a new DataSampelParser
786    ///	    // param: the path to the configuration file is wrong
787    ///		let dsp = DataSampleParser::new_with(&String::from("./target/debug/config/tdg.yaml"));
788    ///
789    ///		// generate some test data using the demo functions
790    ///		assert_eq!(dsp.running_with_issues(), &false);
791    /// }
792    pub fn running_with_issues(&self) -> &bool {
793        &self.issues
794    }
795
796    /// This function saves (exports) the DataSampleParser to a JSON file.
797    /// This is useful when you wish to reuse the algorithm to generate more test data later.
798    ///
799    /// # Arguments
800    ///
801    /// * `field: &String` - The full path of the export file , excluding the file extension, (e.g.: "./test/data/custom-names").</br>
802    ///
803    /// #Errors
804    /// If this function encounters any form of I/O or other error, an error variant will be returned.
805    /// Otherwise, the function returns Ok(true).</br>
806    ///
807    /// #Example
808    ///
809    /// ```
810    /// extern crate test_data_generation;
811    ///
812    /// use test_data_generation::data_sample_parser::DataSampleParser;
813    ///
814    /// fn main() {
815    /// 	// analyze the dataset
816    ///		let mut dsp =  DataSampleParser::new();
817    ///     dsp.analyze_csv_file(&String::from("./tests/samples/sample-00.csv"), None).unwrap();
818    ///
819    ///     assert_eq!(dsp.save(&String::from("./tests/samples/sample-00-dsp")).unwrap(), true);
820    /// }
821    ///
822    pub fn save(&mut self, path: &String) -> Result<bool, io::Error> {
823        let dsp_json = serde_json::to_string(&self).unwrap();
824
825        // Create the archive file
826        let mut file = match File::create(format!("{}.json", &path)) {
827            Err(e) => {
828                error!("Could not create file {:?}", &path.to_string());
829                return Err(e);
830            }
831            Ok(f) => {
832                info!("Successfully exported to {:?}", &path.to_string());
833                f
834            }
835        };
836
837        // Write the json string to file, returns io::Result<()>
838        match file.write_all(dsp_json.as_bytes()) {
839            Err(e) => {
840                error!("Could not write to file {}", &path.to_string());
841                return Err(e);
842            }
843            Ok(_) => {
844                info!("Successfully exported to {}", &path.to_string());
845            }
846        };
847
848        Ok(true)
849    }
850}
851
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::BufReader;

    // csv sample file that most of the tests below analyze
    const SAMPLE_01_CSV: &str = "./tests/samples/sample-01.csv";

    #[test]
    // ensure a new Data Sample Parser can be created
    fn test_new() {
        // the test passes as long as construction does not panic
        let _dsp = DataSampleParser::new();
    }

    #[test]
    // ensure a new Data Sample Parser can be created with configurations
    fn test_new_with() {
        // the test passes as long as construction does not panic
        let _dsp = DataSampleParser::new_with(&String::from("./config/tdg.yaml"));
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file
    fn test_from_file() {
        let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file that
    // was saved using version 0.2.1 using a configuration
    fn test_from_file_v021_with_cfg() {
        let mut dsp =
            DataSampleParser::from_file(&String::from("./tests/samples/sample-0.2.1-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file that
    // was saved using version 0.2.1 without a configuration
    fn test_from_file_v021_no_cfg() {
        let mut dsp =
            DataSampleParser::from_file(&String::from("./tests/samples/sample-0.2.1-nocfg-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can read all the headers from the csv file
    fn test_read_headers() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers.len(), 2);
    }

    #[test]
    // ensure the Data Sample Parser returns the headers in the order listed below
    // (the list is not alphabetical, so a sorted result would fail)
    fn test_read_headers_order() {
        let expected = vec!["column-Z", "column-D", "column-A", "column-G"];
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-02.csv"), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers, expected);
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted file
    fn test_parse_csv_file() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
                .unwrap(),
            1
        );
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted text
    // when no delimiter is specified
    fn test_parse_csv_data_using_defaults() {
        let mut dsp = DataSampleParser::new();
        // rows are joined with '\n'; there is no trailing newline after the last row
        let data = [
            r#""firstname","lastname""#,
            r#""Aaron","Aaberg""#,
            r#""Aaron","Aaby""#,
            r#""Abbey","Aadland""#,
            r#""Abbie","Aagaard""#,
            r#""Abby","Aakre""#,
        ]
        .join("\n");

        assert_eq!(dsp.analyze_csv_data(&data, None).unwrap(), 1);
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted text
    // that uses '|' as the delimiter
    fn test_parse_csv_data() {
        let mut dsp = DataSampleParser::new();
        // rows are joined with '\n'; there is no trailing newline after the last row
        let data = [
            r#""firstname"|"lastname""#,
            r#""Aaron"|"Aaberg""#,
            r#""Aaron"|"Aaby""#,
            r#""Abbey"|"Aadland""#,
            r#""Abbie"|"Aagaard""#,
            r#""Abby"|"Aakre""#,
        ]
        .join("\n");

        assert_eq!(dsp.analyze_csv_data(&data, Some(b'|')).unwrap(), 1);
    }

    #[test]
    // ensure DataSampleParser can generate data for a single field by its name
    // after analyzing a csv formatted file (no assertion; passes unless it panics)
    fn test_generate_field_from_csv_file() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();
        println!(
            "Generated data for first name {}",
            dsp.generate_by_field_name("firstname".to_string())
        );
    }

    #[test]
    // ensure DataSampleParser generates a record with 2 fields
    // after analyzing the csv formatted sample file
    fn test_generate_record_from_csv_file() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();
        assert_eq!(dsp.generate_record().len(), 2);
    }

    #[test]
    // ensure DataSampleParser returns an error when the csv file path is bad
    fn test_parse_csv_file_bad() {
        let mut dsp = DataSampleParser::new();

        assert!(dsp
            .analyze_csv_file(&String::from("./badpath/sample-01.csv"), None)
            .is_err());
    }

    #[test]
    // ensure the DataSampleParser object can be saved to file
    fn test_save() {
        let mut dsp = DataSampleParser::new();
        dsp.analyze_csv_file(&String::from("./tests/samples/sample-00.csv"), None)
            .unwrap();

        assert!(dsp
            .save(&String::from("./tests/samples/sample-00-dsp"))
            .unwrap());
    }

    #[test]
    // ensure the DataSampleParser object reports a levenshtein distance of 3
    // between "kitten" and "sitting"
    fn test_levenshtein_test() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()),
            3_usize
        );
    }

    #[test]
    // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
    fn test_realistic_data_test() {
        let mut dsp = DataSampleParser::new();

        // NOTE: the result is compared using exact f64 equality
        assert_eq!(
            dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()),
            76.92307692307692_f64
        );
    }

    #[test]
    // demo test (no assertion; passes unless it panics)
    fn test_demo() {
        let mut dsp = DataSampleParser::new();
        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();

        // each placeholder is filled from its own generate_record() call
        println!(
            "My new name is {} {}",
            dsp.generate_record()[0],
            dsp.generate_record()[1]
        );
    }

    #[test]
    // ensure the DataSampleParser object can extract the headers from the analyzed sample
    fn test_extract_headers_from_sample() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers.len(), 2);
    }

    #[test]
    // ensure the DataSampleParser object can generate test data as a csv file
    fn test_generate_csv_test_data_from_sample() {
        let generated_path = String::from("./tests/samples/generated-01b.csv");
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from(SAMPLE_01_CSV), None)
            .unwrap();
        dsp.generate_csv(100, &generated_path, Some(b'|')).unwrap();

        // fail with the real cause when the generated file cannot be opened
        let generated_file =
            File::open(&generated_path).expect("generated csv file should be readable");
        let generated_row_count = BufReader::new(generated_file).lines().count();

        // 100 rows were requested; the extra line is presumably the header row
        assert_eq!(generated_row_count, 101);
    }
}