// cifra/attack/dictionaries.rs

1/// Module to deal with words dictionaries.
2///
3/// A dictionary is a repository of distinct words present in an actual language.
4use std::collections::{HashSet, HashMap};
5use std::path::Path;
6// use std::error::Error;
7use std::fs::File;
8// use std::fmt;
9// use std::fmt::{Display, Formatter};
10use diesel::RunQueryDsl;
11use diesel::prelude::*;
12
13use crate::attack::database::{Database, DatabaseSession, NewLanguage, NewWord};
14use crate::cipher::common::normalize_text;
15use crate::{Result, ErrorKind, ResultExt};
16// use crate::schema::*;
17use crate::schema::languages;
18use crate::schema::languages::dsl::*;
19use crate::schema::words;
20use crate::schema::words::dsl::*;
21// use diesel::result::Error::DatabaseError;
22// use regex::Regex;
23use std::io::Read;
24use std::iter::FromIterator;
25
26
27/// Cifra stores word dictionaries in a local database. This class
28/// is a wrapper to not to deal directly with that database.
29///
30/// This class is intended to be used as a context manager so you don't have
31/// to deal with opening and closing this dictionary. So, call this method
32/// as a context manager, it will return this instance so you can call
33/// further methods to manage its words.
pub struct Dictionary {
    // Name of the language this dictionary manages (e.g. "english").
    pub language: String,
    // Id of this language's row in the `languages` table; set to 0 as a
    // placeholder until resolved in `new()` or `create_dictionary()`.
    language_id: i32,
    // Wrapper holding the open database session used by every query.
    database: Database
}
39
40impl Dictionary {
41    /// Remove given language from database.
42    ///
43    /// Be aware that all its words will be removed too.
44    ///
45    /// # Parameters:
46    /// * language: Language to remove from database.
47    pub fn remove_dictionary<T>(_language: T)-> Result<()>
48        where T: AsRef<str> {
49        let database = Database::new()?;
50        diesel::delete(languages::table.filter(language.eq(_language.as_ref())))
51            .execute(&database.session)
52            .chain_err(|| ErrorKind::DatabaseError(String::from("Error deleting language.")))?;
53        Ok(())
54    }
55
56    /// Get languages dictionaries present at database.
57    ///
58    /// # Returns:
59    /// * A list with names of dictionaries present at database.
60    pub fn get_dictionaries_names()-> Result<Vec<String>> {
61        let database = Database::new()?;
62        let dictionaries_names = languages::table.select(languages::language)
63            .load::<String>(&database.session)
64            .chain_err(|| ErrorKind::DatabaseError(String::from("Language list could not be retrieved.")))?;
65        Ok(dictionaries_names)
66    }
67
68    /// # Parameters:
69    /// * language: Language you want to manage its words.
70    /// * create: Whether this language should be created in database if not present yet.
71    ///    It defaults to False. If it is set to False and language is not already present at
72    ///    database then a dictionaries.NotExistingLanguage exception is raised, but if it is
73    ///    set to True then language is registered in database as a new language.
74    pub fn new<T>(_language: T, create: bool)-> Result<Self>
75        where T: AsRef<str> {
76        let new_language = _language.as_ref().to_string();
77        let current_database = Database::new()?;
78        let mut current_dictionary = Dictionary {
79            language: new_language.clone(),
80            language_id: 0,
81            database:current_database
82        };
83        if current_dictionary.already_created() {
84            current_dictionary.language_id = languages::table.filter(language.eq(&current_dictionary.language))
85                .select(languages::id)
86                .first::<i32>(current_dictionary.session())
87                .expect("Language that does not exists in database yet.");
88        } else {
89            if create {
90                current_dictionary.create_dictionary();
91            } else {
92                 bail!(ErrorKind::NotExistingLanguage(new_language.clone()))
93            }
94        }
95        Ok(current_dictionary)
96    }
97
98    /// Get open session for current dictionary database.
99    pub fn session(&self) -> &DatabaseSession {
100        &self.database.session
101    }
102
103    /// Add given word to dictionary.
104    ///
105    /// If word is already present at dictionary, do nothing.
106    ///
107    /// # Parameters:
108    /// * word: word to add to dictionary.
109    pub fn add_word<T>(&mut self, _word: T)
110        where T: AsRef<str> {
111        let new_word = NewWord {
112            word: _word.as_ref(),
113            word_pattern: get_word_pattern(_word.as_ref()),
114            language_id: self.language_id
115        };
116        diesel::insert_into(words::table)
117            .values(&new_word)
118            .execute(self.session())
119            .expect("Error saving new word.");
120    }
121
122    /// Add given words to dictionary.
123    ///
124    /// # Parameters:
125    /// * words: Set of words to add to dictionary.
126    pub fn add_multiple_words(&mut self, _words: &HashSet<String>){
127        let mut word_list: Vec<NewWord> = Vec::new();
128        _words.iter().map(|new_word| {
129            let word_to_add = NewWord {
130                word: new_word,
131                word_pattern: get_word_pattern(new_word),
132                language_id: self.language_id,
133            };
134            word_list.push(word_to_add);
135        }).for_each(drop);
136        diesel::insert_into(words::table)
137            .values(&word_list)
138            .execute(self.session())
139            .expect("Error saving new word.");
140    }
141
142    /// Remove given word from dictionary.
143    ///
144    /// If word is not already present at dictionary, do nothing.
145    ///
146    /// # Parameters:
147    /// * word: word to remove from dictionary.
148    pub fn remove_word<T>(&mut self, _word: T)
149        where T: AsRef<str> {
150        diesel::delete(words::table.filter(word.eq(_word.as_ref()).and(language_id.eq(&self.language_id))))
151            .execute(self.session())
152            .expect("Error deleting word");
153    }
154
155    /// Check if given word exists at this dictionary.
156    ///
157    /// # Parameters:
158    /// * word: word to check.
159    ///
160    /// # Returns:
161    /// True if word is already present at dictionary, False otherwise.
162    pub fn word_exists<T>(&self, _word: T) -> bool
163        where T: AsRef<str> {
164        let _word_clone = _word.as_ref().clone();
165        if let Ok(count) = words::table.filter(word.eq(_word.as_ref()).and(language_id.eq(&self.language_id)))
166            .count()
167            .first::<i64>(self.session()) {
168            if count > 0 {true} else {false}
169        } else {
170            false
171        }
172    }
173
174    /// Read a file's words and stores them at this language database.
175    ///
176    /// # Parameters:
177    /// * file_pathname: Absolute path to file with text to analyze.
178    pub fn populate<T>(&mut self, file_pathname: T)-> Result<()>
179        where T: AsRef<Path> {
180        let _words = get_words_from_text_file(file_pathname.as_ref())?;
181        self.add_multiple_words(&_words);
182        Ok(())
183    }
184
185    /// Check if a table for this instance language already exists at database or not.
186    ///
187    /// # Returns:
188    /// True if a table exists for this instance language, otherwise False.
189    fn already_created(&self)-> bool {
190        if let Ok(_) = languages::table.filter(language.eq(&self.language))
191            .select(languages::id)
192            .first::<i32>(self.session()) {
193            true
194        } else {
195            false
196        }
197    }
198
199    /// Create this instance language table in database.
200    fn create_dictionary(&mut self) {
201        let new_language = NewLanguage {language: self.language.as_str()};
202        diesel::insert_into(languages::table)
203            .values(&new_language)
204            .execute(self.session())
205            .expect("Error saving new language.");
206        self.language_id = languages::table.filter(language.eq(&self.language))
207            .select(languages::id)
208            .first::<i32>(self.session())
209            .expect("Error getting newly created language id.");
210    }
211
212    /// Get a list of every word with given pattern.
213    ///
214    /// # Parameters:
215    /// * pattern: Word patter to search for.
216    ///
217    /// # Returns:
218    /// * List of words at dictionary with given pattern.
219    pub fn get_words_with_pattern<T>(&self, pattern: T) -> Result<Vec<String>>
220        where T: AsRef<str> {
221        // words::table.filter(word.eq(_word.as_ref()).and(language_id.eq(&self.language_id)))
222        let words_result = words::table.filter(word_pattern.eq(pattern.as_ref()))
223            .select(word)
224            .get_results::<String>(self.session());
225        match words_result {
226            Ok(_words) => Ok(_words),
227            Err(e) => bail!(format!("{}",e))
228        }
229    }
230
231    /// Get how many words of given set are really present in this dictionary.
232    ///
233    /// # Parameters:
234    /// * words: Set of words.
235    ///
236    /// # Returns:
237    /// * A float between 0 and 1 being 1 as every word in set is present at dictionary.
238    pub fn get_words_presence(&self, _words: &HashSet<String>) -> f64 {
239        let total_words = _words.len();
240        let current_hits: usize = _words.iter()
241            .map(|_word| if self.word_exists(_word) { 1 } else { 0 })
242            .sum();
243        let presence: f64 = current_hits as f64 / total_words as f64;
244        presence
245    }
246
247    /// Get a list of every word present at dictionary.
248    pub fn get_all_words(&self) -> Result<Vec<String>> {
249        let words_result = words::table
250            .filter(language_id.eq(self.language_id))
251            .select(word)
252            .get_results::<String>(self.session());
253        match words_result {
254            Ok(_words) => Ok(_words),
255            Err(e) => bail!(format!("{}",e))
256        }
257    }
258}
259
/// Get word pattern.
///
/// This pattern is useful to break substitution cipher.
///
/// Equal letters share the same index and indexes grow in order of first
/// appearance, e.g. "hello" -> "0.1.2.2.3".
///
/// # Parameters:
/// * word: Word to get pattern for.
///
/// # Returns:
/// * Word pattern.
pub fn get_word_pattern<T>(_word: T) -> String
    where T: AsRef<str> {
    // Chars already seen, in order of first appearance; a char's position in
    // this vector is its pattern index. Single pass, no per-char String
    // allocations and no re-scan of the whole word per character.
    let mut seen_chars: Vec<char> = Vec::new();
    let mut indexes: Vec<String> = Vec::new();
    for current_char in _word.as_ref().chars() {
        let index = match seen_chars.iter().position(|&c| c == current_char) {
            Some(existing) => existing,
            None => {
                seen_chars.push(current_char);
                seen_chars.len() - 1
            }
        };
        indexes.push(index.to_string());
    }
    indexes.join(".")
}
297
/// Set-like container that remembers the order in which elements were inserted.
///
/// Rust's standard `HashSet` does not keep insertion order, so this small
/// helper stores unique elements in a `Vec`, in arrival order.
struct InsertionOrderedSet<T> {
    items: Vec<T>
}

impl<T> InsertionOrderedSet<T> {

    /// Build an empty set.
    pub fn new() -> Self {
        Self { items: Vec::new() }
    }

    /// Append `new_element` unless an equal element is already stored.
    ///
    /// # Parameters:
    /// * new_element: New element to add.
    pub fn insert(&mut self, new_element: T)
        where T: PartialEq {
        let already_stored = self.contains(&new_element);
        if !already_stored {
            self.items.push(new_element);
        }
    }

    /// Tell whether an equal element is already stored.
    ///
    /// # Parameters:
    /// * element_to_find: Element to look for in the set.
    ///
    /// # Returns:
    /// * True if element is already in set and false if not.
    pub fn contains(&self, element_to_find: &T) -> bool
        where T: PartialEq {
        self.items.iter().any(|stored| stored == element_to_find)
    }

    /// Iterate over stored elements, in insertion order.
    pub fn iter(&self) -> InsertionOrderedSetIterator<T> {
        InsertionOrderedSetIterator { set: self, index: 0 }
    }
}

/// Borrowing iterator over an InsertionOrderedSet, yielding elements in
/// insertion order.
struct InsertionOrderedSetIterator<'a, T: 'a>{
    set: &'a InsertionOrderedSet<T>,
    index: usize
}

impl<'a, T> Iterator for InsertionOrderedSetIterator<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<Self::Item> {
        let next_item = self.set.items.get(self.index)?;
        self.index += 1;
        Some(next_item)
    }
}
365
366/// Extract words from given file.
367///
368/// # Parameters:
369/// * param file_pathname: Absolute filename to file to be read.
370///
371/// # Returns:
372/// A set of words normalized to lowercase and without any punctuation mark.
373pub fn get_words_from_text_file<T>(file_pathname: T) -> Result<HashSet<String>>
374    where T: AsRef<Path> {
375    let mut file_content = String::new();
376    let mut file_to_read = File::open(file_pathname.as_ref())
377        .chain_err(|| ErrorKind::IOError(file_pathname.as_ref().to_string_lossy().to_string()))?;
378    file_to_read.read_to_string(&mut file_content)
379        .chain_err(|| ErrorKind::IOError(file_pathname.as_ref().to_string_lossy().to_string()))?;
380    let words_set = get_words_from_text(file_content);
381    Ok(words_set)
382}
383
384/// Extract words from given text.
385///
386/// Extracted words are normalized to lowercase and any punctuation mark
387/// adjacent to words are removed.
388///
389/// # Parameters:
390/// * text: Text to extract words from.
391///
392/// # Returns:
393/// A set of words normalized to lowercase and without any punctuation mark.
394pub fn get_words_from_text<T>(text: T)-> HashSet<String>
395    where T: AsRef<str> {
396    let words_list = normalize_text(text);
397    let words_set = HashSet::from_iter(words_list.iter().cloned());
398    words_set
399}
400
/// Language selected as more likely to be the one the message is written into.
///
/// # Members:
/// * winner: Name of most likely language. If None then no proper language was found.
/// * winner_probability: Probability this language is actually the right one. If None then no proper language was found.
/// * candidates: Dict with all languages probabilities. Probabilities are floats from 0 to 1.
pub struct IdentifiedLanguage {
    // Name of the winning language, if any candidate scored above zero.
    pub(crate) winner: Option<String>,
    // Word-presence frequency of the winner (0.0 to 1.0); set alongside winner.
    pub(crate) winner_probability: Option<f64>,
    // Frequency of text words present in each stored language's dictionary.
    candidates: HashMap<String, f64>
}
412
413/// Identify language used to write text.
414///
415/// It check each word present at text to find out if is present in any language.
416/// The language that has more words is select as winner.
417///
418/// # Parameters:
419/// * Text: Text to analyze.
420///
421/// # Returns:
422/// * Language selected as more likely to be the one used to write text.
423pub fn identify_language<T>(text: T)-> Result<IdentifiedLanguage>
424    where T: AsRef<str> {
425    let _words = get_words_from_text(&text);
426    let candidates = get_candidates_frecuency(&_words)?;
427    if let Some(winner) = get_winner(&candidates){
428        let winner_probability = *(candidates.get(winner.as_str()).unwrap());
429        Ok(IdentifiedLanguage {
430            winner: Some(winner),
431            winner_probability: Some(winner_probability),
432            candidates
433        })
434    } else {
435        Ok(IdentifiedLanguage {
436            winner: None,
437            winner_probability: None,
438            candidates
439        })
440    }
441}
442
443/// Get frequency of presence of words in each language.
444///
445/// # Parameters:
446/// * words: Text words.
447///
448/// # Returns:
449/// * Dict with all languages probabilities. Probabilities are floats
450///    from 0 to 1. The higher the frequency of presence of words in language
451///    the higher of this probability.
452fn get_candidates_frecuency(_words: &HashSet<String>)-> Result<HashMap<String, f64>> {
453    let total_words = _words.len();
454    let mut candidates: HashMap<String, f64> = HashMap::new();
455    for _language in Dictionary::get_dictionaries_names()? {
456        let dictionary = Dictionary::new(&_language, false)
457            .chain_err(|| ErrorKind::DatabaseError(String::from("Error opening language dictionary")))?;
458        let current_hits: u64 = _words.iter().map(|_word| if dictionary.word_exists(_word) {1} else {0}).sum();
459        let frequency = current_hits as f64 / total_words as f64;
460        candidates.insert(_language, frequency);
461    }
462    Ok(candidates)
463}
464
/// Return candidate with highest frequency.
///
/// A candidate only qualifies when its frequency is strictly greater than zero.
///
/// # Parameters:
/// * candidates: Dict with all languages probabilities. Probabilities are floats
///    from 0 to 1. The higher the frequency of presence of words in language
///    the higher of this probability
fn get_winner(candidates: &HashMap<String, f64>)-> Option<String> {
    let mut best: Option<(&String, f64)> = None;
    for (candidate_name, &frequency) in candidates {
        let is_better = match best {
            None => frequency > 0.0_f64,
            Some((_, best_frequency)) => frequency > best_frequency,
        };
        if is_better {
            best = Some((candidate_name, frequency));
        }
    }
    best.map(|(winner_name, _)| winner_name.clone())
}
482
483/// Assess a list of IdentifiedLanguage objects and select the most likely.
484///
485/// # Parameters:
486/// * identified_languages: A list of tuples with a Caesar key and its corresponding IdentifiedLanguage object.
487///
488/// # Returns:
489/// * Key whose IdentifiedLanguage object got the highest probability.
490pub fn get_best_result(identified_languages: &Vec<Result<(usize, IdentifiedLanguage)>>)-> usize {
491    let mut current_best_key: usize = 0;
492    let mut current_best_key_probability: f64 = 0.0;
493    for result in identified_languages {
494        if let Ok((current_key, identified_language)) = result {
495            if let Some(_) = identified_language.winner {
496                if let Some(winner_probability) = identified_language.winner_probability {
497                    if winner_probability > current_best_key_probability {
498                        current_best_key = *current_key;
499                        current_best_key_probability = winner_probability;
500                    }
501                }
502            }
503        }
504    }
505    current_best_key
506}
507
508// /// Error to alarm when you try to work with a Language that has not been created yet.
509// #[derive(Debug)]
510// pub struct NotExistingLanguage {
511//     language_tried: String
512// }
513//
514// impl NotExistingLanguage {
515//     pub fn new<T>(language_tried: T)-> Self
516//         where T: AsRef<str> {
517//         let _language = language_tried.as_ref().to_string();
518//         NotExistingLanguage{language_tried: _language }
519//     }
520// }
521//
522// impl Error for NotExistingLanguage {}
523//
524// impl Display for NotExistingLanguage {
525//     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
526//         write!(f, "Does not exist any dictionary for {} language", self.language_tried)
527//     }
528// }
529
530#[cfg(test)]
531pub mod tests {
532    /// IMPORTANT NOTE: Diesel uses an environment variable to store its database path. These tests
533    /// set that environment variable to point to temporal folder where to store test database. Problem
534    /// is that cargo test launch test concurrently so each test changes environment variable concurrently
535    /// and you suffer data races, making your tests fail. So, to make these tests work right you
536    /// should run cargo test with this environment variable set:
537    ///
538    /// RUST_TEST_THREADS=1
539    ///
540    /// This way cargo test run every test sequentially and there is no data race.
541    use super::*;
542    use std::fs::{create_dir, File, OpenOptions, read_to_string};
543    use std::env;
544    use test_common::fs::ops::{copy_files};
545    use test_common::fs::tmp::TestEnvironment;
546    use test_common::system::env::TemporalEnvironmentVariable;
547    use rstest::*;
548    use std::ffi::OsString;
549    use std::path::{Path, PathBuf};
550    use std::io::{Write, BufReader, Read};
551    use crate::attack::database;
552    use std::env::temp_dir;
553
554
555    const TEXT_FILE_NAME: &'static str = "text_to_load.txt";
556    const ENGLISH_TEXT_WITHOUT_PUNCTUATIONS_MARKS: &'static str = "This eBook is for the use of anyone anywhere at no cost and with
557almost no restrictions whatsoever You may copy it give it away or
558re use it under the terms of the Project Gutenberg License included
559with this eBook or online at";
560    pub const ENGLISH_TEXT_WITH_PUNCTUATIONS_MARKS: &'static str = "This eBook is for the use of anyone anywhere at no cost and with
561almost no restrictions whatsoever.You may copy it, give it away or
562re-use it under the terms of the Project Gutenberg License included
563with this eBook or online at 2020";
564    const SPANISH_TEXT_WITHOUT_PUNCTUATIONS_MARKS: &'static str = "Todavía lo recuerdo como si aquello hubiera sucedido ayer llegó á las
565puertas de la posada estudiando su aspecto afanosa y atentamente
566seguido por su maleta que alguien conducía tras él en una carretilla de
567mano Era un hombre alto fuerte pesado con un moreno pronunciado
568color de avellana Su trenza ó coleta alquitranada le caía sobre los
569hombros de su nada limpia blusa marina Sus manos callosas destrozadas
570y llenas de cicatrices enseñaban las extremidades de unas uñas rotas y
571negruzcas Y su rostro moreno llevaba en una mejilla aquella gran
572cicatriz de sable sucia y de un color blanquizco lívido y repugnante
573Todavía lo recuerdo paseando su mirada investigadora en torno del
574cobertizo silbando mientras examinaba y prorrumpiendo en seguida en
575aquella antigua canción marina que tan á menudo le oí cantar después";
576    const SPANISH_TEXT_WITH_PUNCTUATIONS_MARKS: &'static str = "Todavía lo recuerdo como si aquello hubiera sucedido ayer: llegó á las
577puertas de la posada estudiando su aspecto, afanosa y atentamente,
578seguido por su maleta que alguien conducía tras él en una carretilla de
579mano. Era un hombre alto, fuerte, pesado, con un moreno pronunciado,
580color de avellana. Su trenza ó coleta alquitranada le caía sobre los
581hombros de su nada limpia blusa marina. Sus manos callosas, destrozadas
582y llenas de cicatrices enseñaban las extremidades de unas uñas rotas y
583negruzcas. Y su rostro moreno llevaba en una mejilla aquella gran
584cicatriz de sable, sucia y de un color blanquizco, lívido y repugnante.
585Todavía lo recuerdo, paseando su mirada investigadora en torno del
586cobertizo, silbando mientras examinaba y prorrumpiendo, en seguida, en
587aquella antigua canción marina que tan á menudo le oí cantar después:";
588    const FRENCH_TEXT_WITHOUT_PUNCTUATIONS_MARKS: &'static str = "Combien le lecteur tandis que commodément assis au coin de son feu
589il s amuse à feuilleter les pages d un roman combien il se rend peu
590compte des fatigues et des angoisses de l auteur Combien il néglige de
591se représenter les longues nuits de luttes contre des phrases rétives
592les séances de recherches dans les bibliothèques les correspondances
593avec d érudits et illisibles professeurs allemands en un mot tout
594l énorme échafaudage que l auteur a édifié et puis démoli simplement
595pour lui procurer à lui lecteur quelques instants de distraction au
596coin de son feu ou encore pour lui tempérer l ennui d une heure en
597wagon";
598    const FRENCH_TEXT_WITH_PUNCTUATIONS_MARKS: &'static str = "Combien le lecteur,--tandis que, commodément assis au coin de son feu,
599il s'amuse à feuilleter les pages d'un roman,--combien il se rend peu
600compte des fatigues et des angoisses de l'auteur! Combien il néglige de
601se représenter les longues nuits de luttes contre des phrases rétives,
602les séances de recherches dans les bibliothèques, les correspondances
603avec d'érudits et illisibles professeurs allemands, en un mot tout
604l'énorme échafaudage que l'auteur a édifié et puis démoli, simplement
605pour lui procurer, à lui, lecteur, quelques instants de distraction au
606coin de son feu, ou encore pour lui tempérer l'ennui d'une heure en
607wagon!";
608    const GERMAN_TEXT_WITHOUT_PUNCTUATIONS_MARKS: &'static str = "Da unser Gutsherr Mr Trelawney Dr Livesay und die übrigen Herren
609mich baten alle Einzelheiten über die Schatzinsel von Anfang bis zu
610Ende aufzuschreiben und nichts auszulassen als die Lage der Insel und
611auch die nur weil noch ungehobene Schätze dort liegen nehme ich im
612Jahre die Feder zur Hand und beginne bei der Zeit als mein Vater
613noch den Gasthof Zum Admiral Benbow hielt und jener dunkle alte
614Seemann mit dem Säbelhieb über der Wange unter unserem Dache Wohnung
615nahm";
616    const GERMAN_TEXT_WITH_PUNCTUATIONS_MARKS: &'static str = "Da unser Gutsherr, Mr. Trelawney, Dr. Livesay und die übrigen Herren
617mich baten, alle Einzelheiten über die Schatzinsel von Anfang bis zu
618Ende aufzuschreiben und nichts auszulassen als die Lage der Insel, und
619auch die nur, weil noch ungehobene Schätze dort liegen, nehme ich im
620Jahre 17.. die Feder zur Hand und beginne bei der Zeit, als mein Vater
621noch den Gasthof „Zum Admiral Benbow“ hielt und jener dunkle, alte
622Seemann mit dem Säbelhieb über der Wange unter unserem Dache Wohnung
623nahm.";
624
    // Languages under test; each needs a resources/<name>_book.txt sample file.
    const LANGUAGES: [&'static str; 4] = ["english", "spanish", "french", "german"];
626
627    pub struct MicroDictionaries {
628        pub(crate) _languages: HashMap<String, Vec<String>>
629    }
630
631    impl MicroDictionaries {
632        pub fn new() -> Self {
633            let mut _languages: HashMap<String, Vec<String>> = HashMap::new();
634            _languages.insert("english".to_string(), vec!["yes".to_string(),
635                                                         "no".to_string(),
636                                                         "dog".to_string(),
637                                                         "cat".to_string(),
638                                                         "snake".to_string()]);
639            _languages.insert("spanish".to_string(), vec!["si".to_string(),
640                                                         "no".to_string(),
641                                                         "perro".to_string(),
642                                                         "gato".to_string()]);
643            _languages.insert("french".to_string(), vec!["qui".to_string(),
644                                                         "non".to_string(),
645                                                         "chien".to_string(),
646                                                         "chat".to_string()]);
647            _languages.insert("german".to_string(), vec!["ja".to_string(),
648                                                         "nein".to_string(),
649                                                         "hund".to_string(),
650                                                         "katze".to_string()]);
651            MicroDictionaries{_languages}
652        }
653    }
654
    /// Class with info to use a temporary dictionaries database.
    pub struct LoadedDictionaries {
        // Temporal folder hosting the test database and book resources.
        pub temp_dir: PathBuf,
        // Names of the languages loaded into the test database.
        pub languages: Vec<String>,
        // Held so the temporal folder lives as long as this fixture.
        temp_env: TestEnvironment,
        // Held so the database-path env var stays overridden while this
        // fixture lives -- NOTE(review): exact variable is set by
        // temporary_database_folder; confirm there.
        temp_env_var: TemporalEnvironmentVariable
    }
662
    impl LoadedDictionaries {
        /// Build a temporal database and populate a dictionary for every
        /// language in LANGUAGES from its resources/<name>_book.txt file.
        pub fn new()-> Self {
            // Point the database env var at a temp folder, then create the schema.
            let (temp_env, temp_env_var) = temporary_database_folder(None);
            database::create_database();
            let temp_dir = temp_env.path().to_owned();
            let mut resources_path = temp_dir.clone();
            resources_path.push("resources");
            // NOTE(review): create_dir's Result is ignored; a failure here
            // would only surface later when copying books.
            create_dir(&resources_path);
            let mut source_path = env::current_dir()
                .expect("Could not get current working dir");
            source_path.push("resources");
            // Copy every language book from the repo into the temp folder.
            copy_files(LANGUAGES.iter()
                .map(|x| format!("{}/{}_book.txt", source_path.to_str().expect("Path contains non unicode characters"), x))
                .collect(),
                       resources_path.as_path().as_os_str().to_str()
                           .expect("Path contains not unicode characters."))
                .expect("Error copying books to temporal folder.");
            // Create a dictionary per language and load its book's words.
            for _language in LANGUAGES.iter() {
                let mut dictionary = Dictionary::new(_language, true)
                    .expect(format!("No dictionary found for {} language.", _language).as_str());
                let mut language_book = resources_path.clone();
                language_book.push(format!("{}_book.txt", _language));
                // NOTE(review): populate's Result is ignored; a read failure
                // would leave this language's dictionary empty.
                dictionary.populate(language_book);
            }
            let mut _languages = Vec::new();
            LANGUAGES.iter().map(|x| _languages.push(x.to_string())).collect::<Vec<_>>();
            LoadedDictionaries{
                temp_dir,
                languages: _languages,
                temp_env,
                temp_env_var
            }
        }
    }
697
    /// Used only as a fixture for tests.
    ///
    /// Creates a temporal database loaded with the four sample language books.
    #[fixture]
    pub fn full_loaded_temp_dictionaries()-> LoadedDictionaries {
        LoadedDictionaries::new()
    }
703
704    /// Get tuples with a language name, a text with punctuations marks and a text without it.
705    fn get_text_tuples()-> Vec<(&'static str, &'static str, &'static str)> {
706        vec![
707            ("english", ENGLISH_TEXT_WITH_PUNCTUATIONS_MARKS, ENGLISH_TEXT_WITHOUT_PUNCTUATIONS_MARKS),
708            ("spanish", SPANISH_TEXT_WITH_PUNCTUATIONS_MARKS, SPANISH_TEXT_WITHOUT_PUNCTUATIONS_MARKS),
709            ("french", FRENCH_TEXT_WITH_PUNCTUATIONS_MARKS, FRENCH_TEXT_WITHOUT_PUNCTUATIONS_MARKS),
710            ("german", GERMAN_TEXT_WITH_PUNCTUATIONS_MARKS, GERMAN_TEXT_WITHOUT_PUNCTUATIONS_MARKS)]
711    }
712
713    /// Get a HashMap with languages as keys and a list of words for every language.
714    fn get_micro_dictionaries_content() -> HashMap<&'static str, Vec<String>>{
715        let mut micro_dictionaries: HashMap<&'static str, Vec<String>> = HashMap::new();
716        micro_dictionaries.insert("english", vec!("yes".to_string(), "no".to_string(), "dog".to_string(), "cat".to_string(), "snake".to_string()));
717        micro_dictionaries.insert("spanish", vec!("si".to_string(), "no".to_string(), "perro".to_string(), "gato".to_string()));
718        micro_dictionaries.insert("french", vec!("qui".to_string(), "non".to_string(), "chien".to_string(), "chat".to_string()));
719        micro_dictionaries.insert("german", vec!("ja".to_string(), "nein".to_string(), "hund".to_string(), "katze".to_string()));
720        micro_dictionaries
721    }
722
723
    /// Create a dictionary at a temp dir filled with only a handful of words.
    ///
    /// # Returns:
    /// Yields created temp_dir to host temporal dictionary database, plus the
    /// overridden env var; both must stay alive while the database is in use.
    #[fixture]
    pub fn loaded_micro_dictionary_temp_dir() -> (TestEnvironment, TemporalEnvironmentVariable) {
        // Point the database env var at a temp folder, then create the schema.
        let (temp_env, temp_env_database_path) = temporary_database_folder(None);
        database::create_database();
        let micro_dictionaries= get_micro_dictionaries_content();
        // let temp_env = TestEnvironment::new();
        // Load every micro dictionary's words into its own language dictionary.
        for (_language, _words) in &micro_dictionaries {
            let mut language_dictionary = Dictionary::new(_language, true)
                .expect(format!("Dictionary not found for {} language", _language).as_str());
            _words.iter().map(|_word| language_dictionary.add_word(_word)).collect::<Vec<_>>();
        }
        // Sanity check: reopen each dictionary and verify every word was stored.
        for (_language, _words) in micro_dictionaries {
            let language_dictionary = Dictionary::new(_language, false)
                .expect(format!("Dictionary not found for {} language", _language).as_str());
            assert!(_words.iter().all(|_word| language_dictionary.word_exists(_word)));
        }
        (temp_env, temp_env_database_path)
    }
746
    /// File with denormalized text in a temporary path.
    ///
    /// Language name this text is written in is at its *language_name* attribute, while
    /// its *normalized_text* has the normalized version.
    struct TemporaryTextFile {
        // Open handle on the temporary file the text was written into.
        pub text_file: File,
        // Expected punctuation-free version of the stored text.
        pub normalized_text: String,
        // Name of the language the text is written in.
        pub language_name: String,
        // Full path of the temporary file on disk.
        pub temp_filename: PathBuf
    }
757
758    impl TemporaryTextFile {
759        pub fn new<T, U, V, W>(temp_dir: T, text: U, normalized_text: V, language_name: W)-> Self
760            where T: AsRef<Path>,
761                  U: AsRef<str>,
762                  V: AsRef<str>,
763                  W: AsRef<str> {
764            let mut temporary_text_file_pathname = PathBuf::from(temp_dir.as_ref().as_os_str());
765            temporary_text_file_pathname.push(TEXT_FILE_NAME);
766            let mut text_file = OpenOptions::new()
767                                            .write(true)
768                                            .create(true)
769                                            .truncate(true)
770                                            .open(&temporary_text_file_pathname)
771                .expect("Error opening temporary text file for writing into it.");
772            text_file.write_all(text.as_ref().as_bytes());
773            TemporaryTextFile {
774                text_file,
775                normalized_text: normalized_text.as_ref().to_string(),
776                language_name: language_name.as_ref().to_string(),
777                temp_filename: temporary_text_file_pathname
778            }
779        }
780    }
781
782    impl AsRef<Path> for TemporaryTextFile {
783        fn as_ref(&self) -> &Path {
784            self.temp_filename.as_path()
785        }
786    }
787
788
789    /// Creates a temporary folder and set that folder as database home.
790    ///
791    /// # Returns:
792    /// You may not use then, but keep them in scope or temp folder will be removed
793    /// and environment var to find database will be restored to its default value.
794    fn temporary_database_folder(temp_dir: Option<TestEnvironment>)-> (TestEnvironment, TemporalEnvironmentVariable){
795        let temp_dir = match temp_dir {
796            None => TestEnvironment::new(),
797            Some(test_env) => test_env
798        };
799        let mut temp_database_path = PathBuf::from(temp_dir.path());
800        temp_database_path.push("cifra_database.sqlite");
801        let temp_env_database_path = TemporalEnvironmentVariable::new(database::DATABASE_ENV_VAR,
802                                                                      temp_database_path.as_os_str().to_str()
803                                                                          .expect("Path contains non unicode chars."));
804        (temp_dir, temp_env_database_path)
805    }
806
807    #[test]
808    fn test_open_not_existing_dictionary() {
809        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
810        match Dictionary::new("english", false) {
811            Ok(_)=> assert!(false),
812            Err(_)=> assert!(true)
813        }
814    }
815
816    #[test]
817    fn test_open_existing_dictionary() {
818        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
819        database::create_database();
820        // Create not existing language.
821        {
822            Dictionary::new("english", true);
823        }
824        // Open newly created language.
825        {
826            let english_dictionary = Dictionary::new("english", false)
827                .expect("Error opening dictionary.");
828            assert!(english_dictionary.already_created());
829        }
830    }
831
832    #[test]
833    /// Test if we can check for word existence, write a new word and finally delete it.
834    fn test_cwd_word() {
835        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
836        database::create_database();
837        let _word = "test";
838        let mut english_dictionary = Dictionary::new("english", true)
839            .expect("Error opening dictionary");
840        assert!(!english_dictionary.word_exists(_word));
841        english_dictionary.add_word(_word);
842        assert!(english_dictionary.word_exists(_word));
843        english_dictionary.remove_word(_word);
844        assert!(!english_dictionary.word_exists(_word));
845    }
846
847    #[test]
848    /// Test a new language creation at database.
849    fn test_create_language() {
850        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
851        let mut english_dictionary = Dictionary {
852            language: "english".to_string(),
853            language_id: 0,
854            database: database::create_database().expect("Error creating database")
855        };
856        assert!(!english_dictionary.already_created());
857        english_dictionary.create_dictionary();
858        assert!(english_dictionary.already_created());
859    }
860
861    #[test]
862    /// Test delete a language also removes its words.
863    fn test_delete_language() {
864        let mut micro_dictionaries = get_micro_dictionaries_content();
865        let (temp_dir, temp_env_database_path) = loaded_micro_dictionary_temp_dir();
866        let language_to_remove = "german";
867        Dictionary::remove_dictionary(language_to_remove);
868        // Check all words from removed language have been removed too.
869        let not_existing_dictionary = Dictionary {
870            language: language_to_remove.to_string(),
871            language_id: 0,
872            database: database::create_database().expect("Error creating database")
873        };
874        let micro_dictionary = micro_dictionaries.get(language_to_remove)
875            .expect("Error opening dictionary to be removed");
876        assert!(micro_dictionary.iter().all(|_word| !not_existing_dictionary.word_exists(_word)));
877    }
878
879    #[test]
880    fn test_get_words_from_text_file() {
881        let temp_dir = TestEnvironment::new();
882        let text_tuples = get_text_tuples();
883        for (language_name, text_with_puntuation_marks, text_without_punctuation_marks) in text_tuples {
884            let temporary_text = TemporaryTextFile::new(&temp_dir,
885                                                        text_with_puntuation_marks,
886                                                        text_without_punctuation_marks,
887                                                        language_name);
888            let mut expected_set = HashSet::new();
889            temporary_text.normalized_text.to_lowercase().split_ascii_whitespace().map(|_word| expected_set.insert(_word.to_string())).collect::<Vec<_>>();
890            let returned_set = get_words_from_text_file(temporary_text.temp_filename)
891                .expect("Error reading text file");
892            let mut diff: Vec<String> = Vec::new();
893            for x in returned_set.symmetric_difference(&expected_set){
894                diff.push(x.clone());
895            }
896            assert_eq!(expected_set, returned_set);
897        }
898    }
899
900    #[test]
901    fn test_populate_words_from_text_files() {
902        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
903        database::create_database();
904        let temporary_text_file = TemporaryTextFile::new(&temp_dir,
905                                                         ENGLISH_TEXT_WITH_PUNCTUATIONS_MARKS,
906                                                         ENGLISH_TEXT_WITHOUT_PUNCTUATIONS_MARKS,
907                                                         "english");
908        let mut expected_set: HashSet<&str> = HashSet::new();
909        let expected_file_content = temporary_text_file.normalized_text;
910        let expected_lowercase_content = expected_file_content.to_lowercase();
911        expected_lowercase_content.split_ascii_whitespace().map(|x| expected_set.insert(x)).collect::<Vec<_>>();
912        {
913            let mut dictionary = Dictionary::new(&temporary_text_file.language_name, true)
914                .expect("Error opening dictionary");
915            dictionary.populate(temporary_text_file.temp_filename.as_path());
916        }
917        {
918            let dictionary = Dictionary::new(&temporary_text_file.language_name, false)
919                .expect("Error opening dictionary");
920            assert!(expected_set.iter().all(|_word| dictionary.word_exists(_word)));
921        }
922    }
923
924    #[test]
925    fn test_get_words_from_text() {
926        let test_tuples = get_text_tuples();
927        for test_tuple in test_tuples {
928            let mut expected_set = HashSet::new();
929            test_tuple.2.to_lowercase().split_ascii_whitespace().map(|_word| expected_set.insert(_word.to_string())).collect::<Vec<_>>();
930            let returned_set = get_words_from_text(test_tuple.1);
931            assert_eq!(expected_set, returned_set);
932        }
933    }
934    
935    #[test]
936    fn test_get_dictionaries_names() {
937        let loaded_dictionaries = LoadedDictionaries::new();
938        let dictionaries_names = Dictionary::get_dictionaries_names().expect("Error getting dictionaries names.");
939        assert_eq!(dictionaries_names, loaded_dictionaries.languages)
940    }
941
942    #[test]
943    fn test_add_multiple_words() {
944        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
945        database::create_database();
946        let _language = "english";
947        let micro_dictionaries = get_micro_dictionaries_content();
948        let mut words_to_add: HashSet<String> = HashSet::new();
949        micro_dictionaries[_language].iter().map(|_word| words_to_add.insert(_word.clone())).collect::<Vec<_>>();
950        let mut dictionary = Dictionary::new(_language, true)
951            .expect("Error opening dictionary.");
952        assert!(!micro_dictionaries[_language].iter().all(|_word| dictionary.word_exists(_word)));
953        dictionary.add_multiple_words(&words_to_add);
954        assert!(micro_dictionaries[_language].iter().all(|_word| dictionary.word_exists(_word)));
955    }
956
957    #[test]
958    fn test_identify_language() {
959        let loaded_dictionaries = LoadedDictionaries::new();
960        let test_cases = vec![(ENGLISH_TEXT_WITH_PUNCTUATIONS_MARKS, "english"),
961                              (SPANISH_TEXT_WITH_PUNCTUATIONS_MARKS, "spanish")];
962        for (text, expected_language) in test_cases{
963            let identified_language = identify_language(text).expect("Error identifying language.");
964            if let Some(winner) = identified_language.winner {
965                assert_eq!(winner, expected_language, "Language not correctly identified.");
966            } else {
967                assert!(false, "Language not identified")
968            }
969            if let Some(winner_probability) = identified_language.winner_probability {
970                assert_eq!(winner_probability, 1.0, "Language probability incorrectly calculated.");
971            } else {
972                assert!(false, "Language probability not found.")
973            }
974        }
975    }
976
977    #[test]
978    fn test_get_word_pattern() {
979        let _word = "HGHHU";
980        let expected_word_pattern = "0.1.0.0.2";
981        let _word_pattern = get_word_pattern(_word);
982        assert_eq!(_word_pattern.as_str(), expected_word_pattern,
983                   "Obtained pattern {} is not what we were waiting for {}.",
984                    _word_pattern.as_str(), expected_word_pattern );
985    }
986
987    #[test]
988    fn test_store_word_pattern() {
989        let _word = "classification";
990        let (temp_dir, temp_env_database_path) = temporary_database_folder(None);
991        database::create_database();
992        if let Ok(mut test_dictionary) = Dictionary::new("test", true) {
993            assert!(!test_dictionary.word_exists(_word));
994            test_dictionary.add_word(_word);
995            assert!(test_dictionary.word_exists(_word));
996            let _words = test_dictionary.get_words_with_pattern("0.1.2.3.3.4.5.4.0.2.6.4.7.8").expect("No word found with that pattern");
997            assert!(_words.contains(&_word.to_string()));
998        } else {
999            assert!(false, "Could not create dictionary.")
1000        }
1001
1002    }
1003
1004    #[test]
1005    fn test_insertion_ordered_set() {
1006        let expected_list = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1007        let mut set: InsertionOrderedSet<String> = InsertionOrderedSet::new();
1008        set.insert("A".to_string());
1009        set.insert("B".to_string());
1010        set.insert("C".to_string());
1011        // Now a repeated char.
1012        set.insert("B".to_string());
1013        let recovered_list: Vec<String> = set.iter().cloned().collect();
1014        assert_eq!(recovered_list, expected_list,
1015            "Recovered list {:?} but we were expecting {:?}",
1016            recovered_list, expected_list);
1017    }
1018
1019    #[rstest]
1020    fn test_get_all_words(loaded_micro_dictionary_temp_dir: (TestEnvironment, TemporalEnvironmentVariable)) {
1021        let expected_words: HashSet<String> = HashSet::from_iter(vec!["yes".to_string(),
1022                                                     "no".to_string(),
1023                                                     "dog".to_string(),
1024                                                     "cat".to_string(), "snake".to_string()]);
1025        let dictionary = Dictionary::new("english", false).unwrap();
1026        let returned_words = dictionary.get_all_words().unwrap();
1027        let returned_words_set = HashSet:: from_iter(returned_words);
1028        assert_eq!(returned_words_set, expected_words)
1029    }
1030}