test_data_generation/data_sample_parser.rs
1//! The `data_sample_parser` module provides functionality to read sample data, parse and analyze it,
2//! so that test data can be generated based on profiles.
3//!
4//! # Examples
5//!
6//!
7//! Generate some demo test data ...
8//!
9//! ```
10//! extern crate test_data_generation;
11//!
12//! use test_data_generation::data_sample_parser::DataSampleParser;
13//!
14//! fn main() {
//! // initialize a new DataSampleParser
16//! let dsp = DataSampleParser::new();
17//!
18//! // generate some test data using the demo functions
19//! println!("generate date:{}", dsp.demo_date());
20//! println!("generate person:{}", dsp.demo_person_name());
21//! }
22//! ```
23//!
24//! Save the algorithm ...
25//!
26//! Archive (export) the data sample parser object so that you can reuse the algorithm to generate test data at a later time.
27//! This enables you to persist the algorithm without having to store the actual data sample that was used to create the algorithm -
28//! Which is important if you used 'real' data in your sample data.
29//!
30//! ```
31//! extern crate test_data_generation;
32//!
33//! use test_data_generation::data_sample_parser::DataSampleParser;
34//!
35//! fn main() {
36//! // analyze the dataset
37//! let mut dsp = DataSampleParser::new();
38//!
39//! assert_eq!(dsp.save(&String::from("./tests/samples/empty-dsp")).unwrap(), true);
40//! }
41//! ```
42//!
43//! Load an algorithm ...
44//!
//! Create a data sample parser from a previously saved (exported) archive file so you can generate test data based on the algorithm.<br/>
//! *NOTE:* In this example, there was only one data point in the data sample that was analyzed (the word 'OK'). This was intentional
//! so the algorithm would be guaranteed to generate that same word. This was done to ensure the assert_eq! returns true.
48//!
49//! ```
50//! extern crate test_data_generation;
51//!
52//! use test_data_generation::data_sample_parser::DataSampleParser;
53//!
54//! fn main() {
55//! let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
56//!
57//! assert_eq!(dsp.generate_record()[0], "OK".to_string());
58//! }
59//! ```
60//!
61//! You can also generate a new csv file based on the data sample provided.
62//!
63//! ```
64//! extern crate test_data_generation;
65//!
66//! use test_data_generation::data_sample_parser::DataSampleParser;
67//!
68//! fn main() {
69//! let mut dsp = DataSampleParser::new();
70//!
71//! // Using the default delimiter (comma)
72//! dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
73//! dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
74//! }
75//! ```
76//!
77
78// use std::collections::BTreeMap;
79use crate::configs::Configs;
80use crate::engine::{Engine, EngineContainer};
81use crate::shared::CsvManipulator;
82use crate::Profile;
83use csv;
84use indexmap::IndexMap;
85use std::fs::File;
86use std::io;
87use std::io::prelude::*;
88use std::io::Write;
89use std::result::Result;
90//use csv::StringRecord;
91use csv::WriterBuilder;
92use serde_json;
93use serde_json::Value;
94use std::error::Error;
95
96use std::sync::mpsc;
97use std::sync::mpsc::{Receiver, Sender};
98use std::thread;
99
/// Default csv field delimiter (a comma); used whenever a caller passes `None` as the delimiter.
const DELIMITER: u8 = b',';

/// Profiles keyed by their profile name, stored in an `IndexMap`.
type ProfilesMap = IndexMap<String, Profile>;
103
#[derive(Serialize, Deserialize, Debug)]
/// Represents the Parser for sample data to be used.
///
/// The parser holds one `Profile` per analyzed field and can be serialized
/// (see `save`) and restored (see `from_file`).
pub struct DataSampleParser {
    /// Indicates if there were issues parsing and analyzing the data sample
    pub issues: bool,
    /// Configs object that defines the configuration settings
    /// (`None` when the parser was constructed with `new()`)
    cfg: Option<Configs>,
    /// Profile objects identified by a unique profile name (an `IndexMap<String, Profile>`),
    /// serialized through `indexmap::serde_seq`
    #[serde(with = "indexmap::serde_seq")]
    profiles: ProfilesMap,
}
115
// Both impls are empty: the parser only uses associated functions that the traits
// themselves provide (`read_as_columns` and `profile_entities_with_container` are
// called through `Self::` further down in this file).
impl CsvManipulator for DataSampleParser {}
impl Engine for DataSampleParser {}
118
119impl DataSampleParser {
120 /// Constructs a new DataSampleParser
121 ///
122 /// #Example
123 ///
124 /// ```
125 /// extern crate test_data_generation;
126 ///
127 /// use test_data_generation::data_sample_parser::DataSampleParser;
128 ///
129 /// fn main() {
    ///     // initialize a new DataSampleParser
131 /// let dsp = DataSampleParser::new();
132 /// }
133 /// ```
134 pub fn new() -> DataSampleParser {
135 DataSampleParser {
136 issues: false,
137 cfg: None,
138 profiles: ProfilesMap::new(),
139 }
140 }
141
142 /// Constructs a new DataSampleParser
143 ///
144 /// # Arguments
145 ///
    /// * `path: &String` - The full path name (including the file name and extension) to the configuration file.</br>
147 ///
148 /// #Example
149 ///
150 /// ```
151 /// extern crate test_data_generation;
152 ///
153 /// use test_data_generation::data_sample_parser::DataSampleParser;
154 ///
155 /// fn main() {
    ///     // initialize a new DataSampleParser
157 /// // param: the path to the configuration file
158 /// let dsp = DataSampleParser::new_with(&String::from("./config/tdg.yaml"));
159 /// }
160 /// ```
161 pub fn new_with(path: &String) -> DataSampleParser {
162 DataSampleParser {
163 issues: false,
164 cfg: Some(Configs::new(path)),
165 profiles: ProfilesMap::new(),
166 }
167 }
168
169 /// Constructs a new DataSampleParser from an exported JSON file. This is used when restoring from "archive"
170 ///
171 /// # Arguments
172 ///
173 /// * `path: &String` - The full path name of the json formatted Data Sample Parser archive file.</br>
174 ///
175 /// #Example
176 ///
177 /// ```
178 /// extern crate test_data_generation;
179 ///
180 /// use test_data_generation::data_sample_parser::DataSampleParser;
181 ///
182 /// fn main() {
183 /// let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
184 ///
185 /// assert_eq!(dsp.generate_record()[0], "OK".to_string());
186 /// }
187 /// ```
188 pub fn from_file(path: &String) -> DataSampleParser {
189 // open the archive file
190 let mut file = match File::open(format!("{}.json", &path)) {
191 Err(_e) => {
192 error!("Could not open file {:?}", &path.to_string());
193 panic!("Could not open file {:?}", &path.to_string());
194 }
195 Ok(f) => {
196 info!("Successfully opened file {:?}", &path.to_string());
197 f
198 }
199 };
200
201 //read the archive file
202 let mut serialized = String::new();
203 match file.read_to_string(&mut serialized) {
204 Err(e) => {
205 error!(
206 "Could not read file {:?} because of {:?}",
207 &path.to_string(),
208 e.to_string()
209 );
210 panic!(
211 "Could not read file {:?} because of {:?}",
212 &path.to_string(),
213 e.to_string()
214 );
215 }
216 Ok(s) => {
217 info!("Successfully read file {:?}", &path.to_string());
218 s
219 }
220 };
221
222 // Support backwards compatibility for DSP saved using prior versions
223 let dsp: Value = serde_json::from_str(&serialized).unwrap();
224 let prfils = dsp.get("profiles").unwrap();
225
226 match prfils.is_array() {
227 true => {
228 debug!("Version 0.3.0 detected. Using latest version");
229 return serde_json::from_str(&serialized).unwrap();
230 }
231 false => {
232 info!("Prior version 0.2.1 detected. Trying to upgrade to latest version");
233
234 return Self::upgrade_to_latest_version(serialized);
235 }
236 }
237 }
238
239 fn upgrade_to_latest_version(serialized: String) -> DataSampleParser {
240 let dsp: Value = serde_json::from_str(&serialized).unwrap();
241 let prfils = dsp.get("profiles").unwrap();
242 let mut pm: ProfilesMap = ProfilesMap::new();
243 let issues = dsp.get("issues").unwrap().as_bool().unwrap();
244
245 for prf in prfils.as_object().iter() {
246 for attr in prf.keys() {
247 let id = prf
248 .get(attr)
249 .unwrap()
250 .as_object()
251 .unwrap()
252 .get("id")
253 .unwrap()
254 .as_str()
255 .unwrap()
256 .to_string();
257 let serl = &serde_json::to_string(prf.get(attr).unwrap()).unwrap();
258 println!("{:?} : {:?}", id, serl);
259 pm.insert(id, Profile::from_serialized(serl));
260 }
261 }
262
263 let mut rtn = match dsp.get("cfg").unwrap() {
264 serde_json::Value::Null => DataSampleParser::new(),
265 _ => DataSampleParser::new_with(
266 &dsp.get("cfg")
267 .unwrap()
268 .as_object()
269 .unwrap()
270 .get("file")
271 .unwrap()
272 .as_str()
273 .unwrap()
274 .to_string(),
275 ),
276 };
277
278 rtn.issues = issues;
279 rtn.profiles = pm;
280 return rtn;
281 }
282
283 #[inline]
284 fn analyze_columns(&mut self, profile_keys: Vec<String>, columns: Vec<Vec<String>>) {
285 let col_cnt = columns.len();
286 let (tx, rx): (
287 Sender<Result<Profile, String>>,
288 Receiver<Result<Profile, String>>,
289 ) = mpsc::channel();
290 let mut jobs = Vec::new();
291
292 //iterate through all the columns
293 for (idx, column) in columns.iter().enumerate() {
294 let thread_tx = tx.clone();
295 let container = EngineContainer {
296 profile: self.profiles.get(&profile_keys[idx]).unwrap().clone(),
297 entities: column.to_vec(),
298 };
299
300 let job = thread::spawn(move || {
301 let result = Self::profile_entities_with_container(container);
302 thread_tx.send(result).unwrap();
303 });
304
305 jobs.push(job);
306 }
307
308 let mut results = Vec::with_capacity(col_cnt);
309 for _ in 0..col_cnt {
310 results.push(rx.recv());
311 }
312
313 for job in jobs {
314 job.join().expect("Error: Could not run the job");
315 }
316
317 for result in results {
318 match result {
319 Ok(msg) => {
320 //received from sender
321 match msg {
322 Ok(p) => {
323 let id = p.clone().id.unwrap();
324 debug!("Profile {} has finished analyzing the entities.", id);
325 self.profiles.insert(id, p);
326 }
327 Err(e) => {
328 error!(
329 "Profile wasn't able to analyzing the entities. Error: {}",
330 e
331 );
332 }
333 }
334 }
335 Err(e) => {
336 // could not receive from sender
337 error!("Receiver wasn't able to receive message from sender which was analyzing entities for the profile. Error: {}", e);
338 panic!("Receiver wasn't able to receive message from sender which was analyzing entities for the profile. Error: {}", e);
339 }
340 }
341 }
342 // Multi-Threading END
343 }
344
    /// This function analyzes sample data that is a csv formatted string and returns `Ok(1)` if successful.
    /// _NOTE:_ The csv properties are as follows:
    /// + headers are included as first line
    /// + double quote wrap text
    /// + double quote escapes is enabled
    /// + delimiter is a comma, unless another delimiter is supplied
351 ///
352 ///
353 /// # Arguments
354 ///
355 /// * `data: &String` - The textual content of a csv formatted sample data file.</br>
356 /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
357 ///
358 /// # Example
359 ///
360 /// ```
361 /// extern crate test_data_generation;
362 ///
363 /// use test_data_generation::data_sample_parser::DataSampleParser;
364 ///
365 /// fn main() {
    ///     // initialize a new DataSampleParser
367 /// let mut dsp = DataSampleParser::new();
368 /// let mut data = String::from("");
369 /// data.push_str("\"firstname\",\"lastname\"\n");
370 /// data.push_str("\"Aaron\",\"Aaberg\"\n");
371 /// data.push_str("\"Aaron\",\"Aaby\"\n");
372 /// data.push_str("\"Abbey\",\"Aadland\"\n");
373 /// data.push_str("\"Abbie\",\"Aagaard\"\n");
374 /// data.push_str("\"Abby\",\"Aakre\"");
375 ///
376 /// // Use the default delimiter (comma)
377 /// assert_eq!(dsp.analyze_csv_data(&data, None).unwrap(),1);
378 /// }
379 /// ```
380 pub fn analyze_csv_data(
381 &mut self,
382 data: &String,
383 delimiter: Option<u8>,
384 ) -> Result<i32, String> {
385 debug!("Starting to analyzed the csv data {}", data);
386
387 let mut rdr = csv::ReaderBuilder::new()
388 .has_headers(true)
389 .quote(b'"')
390 .double_quote(true)
391 .delimiter(Self::else_default_delimiter(delimiter))
392 .from_reader(data.as_bytes());
393
394 //iterate through the headers
395 for headers in rdr.headers() {
396 for header in headers.iter() {
397 //add a Profile to the list of profiles to represent the field (indexed using the header label)
398 let p = Profile::new_with_id(header.to_string());
399 self.profiles.insert(header.to_string(), p);
400 }
401 }
402
403 //create a Vec from all the keys (headers) in the profiles list
404 let profile_keys: Vec<_> = self.profiles.keys().cloned().collect();
405
406 debug!("CSV headers: {:?}", profile_keys);
407
408 // Multi-Threading START
409 let columns = Self::read_as_columns(rdr);
410 //let col_cnt = columns.len();
411 let rec_cnt = columns[0].len();
412 self.analyze_columns(profile_keys, columns);
413
414 debug!("Successfully analyzed the csv data");
415 debug!(
416 "Analyzed {} records, {} fields",
417 rec_cnt,
418 self.profiles.len()
419 );
420
421 //prepare the profiles for data generation
422 self.profiles.iter_mut().for_each(|p| p.1.pre_generate());
423
424 Ok(1)
425 }
426
    /// This function analyzes sample data that is a csv formatted file and returns `Ok(1)` if successful.
    /// _NOTE:_ The csv properties are as follows:
    /// + headers are included as first line
    /// + double quote wrap text
    /// + double quote escapes is enabled
    /// + delimiter is a comma, unless another delimiter is supplied
433 ///
434 ///
435 /// # Arguments
436 ///
437 /// * `path: &String` - The full path name of the csv formatted sample data file.</br>
438 /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
439 ///
440 /// # Example
441 ///
442 /// ```
443 /// extern crate test_data_generation;
444 ///
445 /// use test_data_generation::data_sample_parser::DataSampleParser;
446 ///
447 /// fn main() {
    ///     // initialize a new DataSampleParser
449 /// let mut dsp = DataSampleParser::new();
450 ///
451 /// // Use the default delimiter (comma)
452 /// assert_eq!(dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap(),1);
453 /// }
454 /// ```
455 pub fn analyze_csv_file(
456 &mut self,
457 path: &String,
458 delimiter: Option<u8>,
459 ) -> Result<i32, String> {
460 info!("Starting to analyzed the csv file {}", path);
461
462 let mut file = (File::open(path).map_err(|e| {
463 error!("csv file {} couldn't be opened!", path);
464 e.to_string()
465 }))?;
466
467 let mut data = String::new();
468 file.read_to_string(&mut data)
469 .map_err(|e| {
470 error!("csv file {} couldn't be read!", path);
471 e.to_string()
472 })
473 .unwrap();
474
475 self.analyze_csv_data(&data, delimiter)
476 }
477
    /// This function generates a date as a string using a `demo` profile
479 ///
480 /// # Example
481 ///
482 /// ```
483 /// extern crate test_data_generation;
484 ///
485 /// use test_data_generation::data_sample_parser::DataSampleParser;
486 ///
487 /// fn main() {
    ///     // initialize a new DataSampleParser
489 /// let dsp = DataSampleParser::new();
490 ///
491 /// // generate some test data using the demo functions
492 /// println!("generate date:{}", dsp.demo_date());
493 /// }
494 /// ```
495 pub fn demo_date(&self) -> String {
496 let mut profil = Profile::new();
497
498 profil.analyze("01/04/2017");
499 profil.analyze("02/09/2017");
500 profil.analyze("03/13/2017");
501 profil.analyze("04/17/2017");
502 profil.analyze("05/22/2017");
503 profil.analyze("07/26/2017");
504 profil.analyze("08/30/2017");
505 profil.analyze("09/07/2017");
506 profil.analyze("10/11/2017");
507 profil.analyze("11/15/2017");
508 profil.analyze("12/21/2017");
509 profil.analyze("01/14/2016");
510 profil.analyze("02/19/2016");
511 profil.analyze("03/23/2016");
512 profil.analyze("04/27/2016");
513 profil.analyze("05/02/2016");
514 profil.analyze("07/16/2015");
515 profil.analyze("08/20/2015");
516 profil.analyze("09/17/2015");
517 profil.analyze("10/01/2014");
518 profil.analyze("11/25/2014");
519 profil.analyze("12/31/2018");
520
521 profil.pre_generate();
522 //profil.apply_facts("##p##p####".to_string())
523 profil.generate()
524 }
525
    /// This function generates a person's name as a string using a `demo` profile
527 ///
528 /// # Example
529 ///
530 /// ```
531 /// extern crate test_data_generation;
532 ///
533 /// use test_data_generation::data_sample_parser::DataSampleParser;
534 ///
535 /// fn main() {
    ///     // initialize a new DataSampleParser
537 /// let dsp = DataSampleParser::new();
538 ///
539 /// // generate some test data using the demo functions
    ///     println!("generate person:{}", dsp.demo_person_name());
    /// }
    /// ```
542 pub fn demo_person_name(&self) -> String {
543 let mut profil = Profile::new();
544
545 profil.analyze("Smith, John");
546 profil.analyze("O'Brien, Henny");
547 profil.analyze("Dale, Danny");
548 profil.analyze("Rickets, Ronnae");
549 profil.analyze("Richard, Richie");
550 profil.analyze("Roberts, Blake");
551 profil.analyze("Conways, Sephen");
552
553 profil.pre_generate();
554 profil.generate()
555 }
556
557 fn else_default_delimiter(delimiter: Option<u8>) -> u8 {
558 match delimiter {
559 Some(d) => {
560 return d;
561 }
562 None => {
563 return DELIMITER;
564 }
565 }
566 }
567
568 /// This function returns a vector of header names
569 ///
570 /// # Example
571 ///
572 /// ```
573 /// extern crate test_data_generation;
574 ///
575 /// use test_data_generation::data_sample_parser::DataSampleParser;
576 ///
577 /// fn main() {
    ///     // initialize a new DataSampleParser
579 /// let mut dsp = DataSampleParser::new();
580 ///
581 /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
582 /// let headers = dsp.extract_headers();
583 ///
584 /// assert_eq!(headers.len(), 2);
    /// }
    /// ```
586 pub fn extract_headers(&mut self) -> Vec<String> {
587 let mut headers = vec![];
588
589 for profile in self.profiles.iter_mut() {
590 headers.push(profile.0.to_string());
591 }
592
593 headers
594 }
595
596 /// This function generates test data for the specified field name.
597 ///
598 /// # Arguments
599 ///
600 /// * `field: String` - The name of the field (e.g.: firstname) the represents the profile to use when generating the test data.</br>
601 ///
602 /// # Example
603 ///
604 /// ```
605 /// extern crate test_data_generation;
606 ///
607 /// use test_data_generation::data_sample_parser::DataSampleParser;
608 ///
609 /// fn main() {
    ///     // initialize a new DataSampleParser
611 /// let mut dsp = DataSampleParser::new();
612 ///
613 /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
614 /// println!("Generated data for first name {}",dsp.generate_by_field_name("firstname".to_string()));
615 /// }
616 /// ```
617 pub fn generate_by_field_name(&mut self, field: String) -> String {
618 self.profiles
619 .get_mut(&field)
620 .unwrap()
621 .generate()
622 .to_string()
623 }
624
    /// This function generates a Vec of test data fields (one value per profile).
626 ///
627 /// # Example
628 ///
629 /// ```
630 /// extern crate test_data_generation;
631 ///
632 /// use test_data_generation::data_sample_parser::DataSampleParser;
633 ///
634 /// fn main() {
    ///     // initialize a new DataSampleParser
636 /// let mut dsp = DataSampleParser::new();
637 ///
638 /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
639 /// println!("Generated data record: {:?}",dsp.generate_record());
640 /// }
641 /// ```
642 pub fn generate_record(&mut self) -> Vec<String> {
643 let mut record = Vec::new();
644
645 for profile in self.profiles.iter_mut() {
646 record.push(profile.1.generate().to_string());
647 }
648
649 record
650 }
651
652 /// This function creates a csv file of generated test data.
    /// Prior to calling this function, you need to call the analyze_csv_file() function.
654 /// _NOTE:_ The csv properties are as follows:
655 /// + headers are included as first line
656 /// + double quotes wrap text
657 /// + double quote escapes is enabled
658 /// + delimiter is a comma
659 ///
660 ///
661 /// # Arguments
662 ///
663 /// * `row_count: u32` - The number of rows to generate.</br>
664 /// * `path: &String` - The full path name where to save the csv file.</br>
665 /// * `delimiter: Option<u8>` - The delimiter to use, otherwise use the default.</br>
666 ///
667 /// # Example
668 ///
669 /// ```
670 /// extern crate test_data_generation;
671 ///
672 /// use test_data_generation::data_sample_parser::DataSampleParser;
673 ///
674 /// fn main() {
    ///     // initialize a new DataSampleParser
676 /// let mut dsp = DataSampleParser::new();
677 ///
678 /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None).unwrap();
679 /// dsp.generate_csv(100, &String::from("./tests/samples/generated-01.csv"), None).unwrap();
680 /// }
681 /// ```
682 pub fn generate_csv(
683 &mut self,
684 row_count: u32,
685 path: &String,
686 delimiter: Option<u8>,
687 ) -> Result<(), Box<dyn Error>> {
688 info!("generating csv file {}", path);
689
690 let mut wtr = (WriterBuilder::new()
691 .has_headers(true)
692 .quote(b'"')
693 .double_quote(true)
694 .delimiter(Self::else_default_delimiter(delimiter))
695 .from_path(path)
696 .map_err(|e| {
697 error!("csv file {} couldn't be created!", path);
698 e.to_string()
699 }))?;
700
701 let headers = self.extract_headers();
702 wtr.write_record(&headers)?;
703
704 for _r in 0..row_count {
705 let mut record = Vec::new();
706
707 for profile in self.profiles.iter_mut() {
708 record.push(profile.1.generate());
709 }
710
711 wtr.write_record(&record)?;
712 }
713
714 wtr.flush()?;
715
716 Ok(())
717 }
718
719 /// This function calculates the levenshtein distance between 2 strings.
720 /// See: https://crates.io/crates/levenshtein
721 ///
722 /// # Arguments
723 ///
724 /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
725 /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
726 ///
727 /// #Example
728 ///
729 /// ```
730 /// extern crate test_data_generation;
731 ///
732 /// use test_data_generation::data_sample_parser::DataSampleParser;
733 ///
734 /// fn main() {
735 /// // analyze the dataset
736 /// let mut dsp = DataSampleParser::new();
737 ///
738 /// assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
739 /// }
    /// ```
    pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
        // Thin wrapper: the work is done by the `levenshtein_distance!` macro;
        // no state of `self` is read or changed here.
        // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
        levenshtein_distance!(control, experiment)
    }
745
746 /// This function calculates the percent difference between 2 strings.
747 ///
748 /// # Arguments
749 ///
750 /// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
751 /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the percent difference.</br>
752 ///
753 /// #Example
754 ///
755 /// ```
756 /// extern crate test_data_generation;
757 ///
758 /// use test_data_generation::data_sample_parser::DataSampleParser;
759 ///
760 /// fn main() {
761 /// // analyze the dataset
762 /// let mut dsp = DataSampleParser::new();
763 ///
764 /// assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
765 /// }
    /// ```
    pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 {
        // Thin wrapper: the work is done by the `realistic_test!` macro;
        // no state of `self` is read or changed here.
        // Background reading kept from the original author:
        //https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html
        //http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
        // pearson's chi square test
        // cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
        realistic_test!(control, experiment)
    }
774
775 /// This function returns a boolean that indicates if the data sample parsing had issues
776 ///
777 /// # Example
778 ///
779 /// ```
780 /// extern crate test_data_generation;
781 ///
782 /// use test_data_generation::data_sample_parser::DataSampleParser;
783 ///
784 /// fn main() {
    ///     // initialize a new DataSampleParser
786 /// // param: the path to the configuration file is wrong
787 /// let dsp = DataSampleParser::new_with(&String::from("./target/debug/config/tdg.yaml"));
788 ///
789 /// // generate some test data using the demo functions
790 /// assert_eq!(dsp.running_with_issues(), &false);
    /// }
    /// ```
    pub fn running_with_issues(&self) -> &bool {
        // hands out a reference to the public `issues` flag
        &self.issues
    }
795
796 /// This function saves (exports) the DataSampleParser to a JSON file.
797 /// This is useful when you wish to reuse the algorithm to generate more test data later.
798 ///
799 /// # Arguments
800 ///
    /// * `path: &String` - The full path of the export file, excluding the file extension (e.g.: "./test/data/custom-names").</br>
802 ///
803 /// #Errors
804 /// If this function encounters any form of I/O or other error, an error variant will be returned.
805 /// Otherwise, the function returns Ok(true).</br>
806 ///
807 /// #Example
808 ///
809 /// ```
810 /// extern crate test_data_generation;
811 ///
812 /// use test_data_generation::data_sample_parser::DataSampleParser;
813 ///
814 /// fn main() {
815 /// // analyze the dataset
816 /// let mut dsp = DataSampleParser::new();
817 /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-00.csv"), None).unwrap();
818 ///
819 /// assert_eq!(dsp.save(&String::from("./tests/samples/sample-00-dsp")).unwrap(), true);
820 /// }
    /// ```
822 pub fn save(&mut self, path: &String) -> Result<bool, io::Error> {
823 let dsp_json = serde_json::to_string(&self).unwrap();
824
825 // Create the archive file
826 let mut file = match File::create(format!("{}.json", &path)) {
827 Err(e) => {
828 error!("Could not create file {:?}", &path.to_string());
829 return Err(e);
830 }
831 Ok(f) => {
832 info!("Successfully exported to {:?}", &path.to_string());
833 f
834 }
835 };
836
837 // Write the json string to file, returns io::Result<()>
838 match file.write_all(dsp_json.as_bytes()) {
839 Err(e) => {
840 error!("Could not write to file {}", &path.to_string());
841 return Err(e);
842 }
843 Ok(_) => {
844 info!("Successfully exported to {}", &path.to_string());
845 }
846 };
847
848 Ok(true)
849 }
850}
851
// Unit tests for the DataSampleParser. They read sample files from (and write
// generated files to) `./tests/samples`, relative to the directory the tests run in.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::BufReader;

    #[test]
    // ensure a new Data Sample Parser can be created
    fn test_new() {
        let _dsp = DataSampleParser::new();

        assert!(true);
    }

    #[test]
    // ensure a new Data Sample Parser can be created with configurations
    fn test_new_with() {
        let _dsp = DataSampleParser::new_with(&String::from("./config/tdg.yaml"));

        assert!(true);
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file
    fn test_from_file() {
        let mut dsp = DataSampleParser::from_file(&String::from("./tests/samples/sample-00-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file that
    // was saved using version 0.2.1 using a configuration
    fn test_from_file_v021_with_cfg() {
        let mut dsp =
            DataSampleParser::from_file(&String::from("./tests/samples/sample-0.2.1-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can be restored from archived file that
    // was saved using version 0.2.1 without a configuration
    fn test_from_file_v021_no_cfg() {
        let mut dsp =
            DataSampleParser::from_file(&String::from("./tests/samples/sample-0.2.1-nocfg-dsp"));
        println!("Sample data is [{:?}]", dsp.generate_record()[0]);

        assert_eq!(dsp.generate_record()[0], "OK".to_string());
    }

    #[test]
    // ensure the Data Sample Parser can read all the headers from the csv file
    fn test_read_headers() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers.len(), 2);
    }

    #[test]
    // ensure the Data Sample Parser returns the headers in the expected order
    // (column-Z, column-D, column-A, column-G) rather than sorted
    fn test_read_headers_order() {
        let mut expected = Vec::new();
        expected.push("column-Z");
        expected.push("column-D");
        expected.push("column-A");
        expected.push("column-G");
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-02.csv"), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers, expected);
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted file
    fn test_parse_csv_file() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
                .unwrap(),
            1
        );
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted text using the default delimiter
    fn test_parse_csv_data_using_defaults() {
        let mut dsp = DataSampleParser::new();
        let mut data = String::from("");
        data.push_str("\"firstname\",\"lastname\"\n");
        data.push_str("\"Aaron\",\"Aaberg\"\n");
        data.push_str("\"Aaron\",\"Aaby\"\n");
        data.push_str("\"Abbey\",\"Aadland\"\n");
        data.push_str("\"Abbie\",\"Aagaard\"\n");
        data.push_str("\"Abby\",\"Aakre\"");

        assert_eq!(dsp.analyze_csv_data(&data, None).unwrap(), 1);
    }

    #[test]
    // ensure DataSampleParser can analyze a csv formatted text using a custom delimiter (pipe)
    fn test_parse_csv_data() {
        let mut dsp = DataSampleParser::new();
        let mut data = String::from("");
        data.push_str("\"firstname\"|\"lastname\"\n");
        data.push_str("\"Aaron\"|\"Aaberg\"\n");
        data.push_str("\"Aaron\"|\"Aaby\"\n");
        data.push_str("\"Abbey\"|\"Aadland\"\n");
        data.push_str("\"Abbie\"|\"Aagaard\"\n");
        data.push_str("\"Abby\"|\"Aakre\"");

        assert_eq!(dsp.analyze_csv_data(&data, Some(b'|')).unwrap(), 1);
    }
    #[test]
    // ensure DataSampleParser can generate data for a single field after analyzing a csv file
    // (no assertion: the test only checks that generation does not panic)
    fn test_generate_field_from_csv_file() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();
        println!(
            "Generated data for first name {}",
            dsp.generate_by_field_name("firstname".to_string())
        );
    }

    #[test]
    // ensure DataSampleParser generates a record with one value per analyzed field
    fn test_generate_record_from_csv_file() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();
        assert_eq!(dsp.generate_record().len(), 2);
    }

    #[test]
    // ensure DataSampleParser returns an error when the csv file cannot be opened
    fn test_parse_csv_file_bad() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.analyze_csv_file(&String::from("./badpath/sample-01.csv"), None)
                .is_err(),
            true
        );
    }

    #[test]
    // ensure the DataSampleParser object can be saved to file
    fn test_save() {
        let mut dsp = DataSampleParser::new();
        dsp.analyze_csv_file(&String::from("./tests/samples/sample-00.csv"), None)
            .unwrap();

        assert_eq!(
            dsp.save(&String::from("./tests/samples/sample-00-dsp"))
                .unwrap(),
            true
        );
    }

    #[test]
    // ensure the DataSampleParser object reports the levenshtein distance between 2 strings
    fn test_levenshtein_test() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()),
            3 as usize
        );
    }

    #[test]
    // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
    fn test_realistic_data_test() {
        let mut dsp = DataSampleParser::new();

        assert_eq!(
            dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()),
            76.92307692307692 as f64
        );
    }

    #[test]
    // demo test
    // (no meaningful assertion: it only checks that analysis and generation do not panic)
    fn test_demo() {
        let mut dsp = DataSampleParser::new();
        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();

        println!(
            "My new name is {} {}",
            dsp.generate_record()[0],
            dsp.generate_record()[1]
        );

        assert!(true);
    }

    #[test]
    // ensure the DataSampleParser object can extract the headers of the analyzed sample
    fn test_extract_headers_from_sample() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();
        let headers = dsp.extract_headers();

        assert_eq!(headers.len(), 2);
    }

    #[test]
    // ensure the DataSampleParser object can generate test data as a csv file
    fn test_generate_csv_test_data_from_sample() {
        let mut dsp = DataSampleParser::new();

        dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv"), None)
            .unwrap();
        dsp.generate_csv(
            100,
            &String::from("./tests/samples/generated-01b.csv"),
            Some(b'|'),
        )
        .unwrap();

        // count the lines of the generated file (0 when the file cannot be opened)
        let generated_row_count =
            match File::open(format!("{}", "./tests/samples/generated-01b.csv")) {
                Err(_e) => 0,
                Ok(f) => {
                    let mut count = 0;
                    let bf = BufReader::new(f);

                    for _line in bf.lines() {
                        count += 1;
                    }

                    count
                }
            };

        // 100 generated rows plus the header line
        assert_eq!(generated_row_count, 101);
    }
}