//! # Text_Analysis
//! Analyze text stored as *.txt or *.pdf in provided file or directory. Doesn't read files in subdirectories.
//! Counting all words and then searching for every unique word in the vicinity (+-5 words).
//! Stores results in file [date/time]results_word_analysis.txt in given directory.
//! ## Usage: ```text_analysis path/to/directory_or_file```
//! # Example
//! ```
//! use text_analysis::{count_words, save_file, sort_map_to_vec, trim_to_words, words_near};
//! use std::collections::HashMap;
//!
//! //Create an example string. Would normally be read from file. Words consisting of just one char will be ignored.
//! let content_string: String = "An example phrase including two times the word two".to_string();
//! let content_vec: Vec<String> = trim_to_words(content_string).unwrap();
//!
//! //Count frequency in HashMap and sort HashMap to Vec according to frequency
//! let word_frequency = count_words(&content_vec).unwrap();
//! let words_sorted = sort_map_to_vec(word_frequency).unwrap();
//!
//! //Search for words +-5 near each unique word, count them and insert in HashMap
//! let mut index_rang: usize = 0;
//! let mut words_near_map: HashMap<String, HashMap<String, u32>> = HashMap::new();
//! for word in &words_sorted {
//!     words_near_map.extend(words_near(&word, index_rang, &content_vec, &words_sorted).unwrap());
//!     index_rang += 1;
//! }
//!
//! //prepare output as String. Afterwards you may e.g. write this String to a file.
//! let mut result_as_string = String::new();
//!
//! //fill the String with word, frequency, words near
//! for word in words_sorted {
//!     let (word_only, frequency) = &word;
//!     let words_near = &words_near_map[word_only];
//!     let combined = format!(
//!         "Word: {:?}, Frequency: {:?},\nWords near: {:?} \n\n",
//!         word_only,
//!         frequency,
//!         sort_map_to_vec(words_near.to_owned()).unwrap()
//!     );
//!     result_as_string.push_str(&combined);
//! }
//!
//! //print resulting String
//! println!("{:?}", result_as_string);
//!
//! ``` use std::collections::HashMap; use std::fs::OpenOptions; use std::io::prelude::*; use std::path::PathBuf; use chrono::prelude::*; use rayon::prelude::*; ///Search for words +-5 around given word. Returns result. /// # Example /// ``` /// use std::{collections::HashMap, hash::Hash}; /// use text_analysis::{count_words, sort_map_to_vec, words_near}; /// /// //function to allow uncomplete comparison of 2 HashMaps /// fn keys_match<T: Eq + Hash, U, V>(map1: &HashMap<T, U>, map2: &HashMap<T, V>) -> bool { /// map1.len() == map2.len() && map1.keys().all(|k| map2.contains_key(k)) /// } /// /// //create Vec<Strings>. Would be normally read from file as String and then "trim_to_words(content_of_file_as_string)" to obtain Vec<String> /// let words = vec![ /// "one".to_string(), /// "two".to_string(), /// "three".to_string(), /// "four".to_string(), /// "four".to_string(), /// "five".to_string(), /// ]; /// /// //define word we will be searching for other words near /// let word = ("two".to_string(), 2 as u32); /// /// //create sorted Vector of words according to their frequency /// let words_sorted: Vec<(String, u32)> = /// sort_map_to_vec(count_words(&words).unwrap()).unwrap(); /// let index_rang: usize = 1; /// /// //do the actual search for words +-5 near the word "two" /// let words_near_map = words_near(&word, index_rang, &words, &words_sorted).unwrap(); /// /// //create the expected result to compare to words_near_map /// let mut hashmap_inner = HashMap::new(); /// hashmap_inner.insert("four".to_string(), 2 as u32); /// hashmap_inner.insert("one".to_string(), 1 as u32); /// hashmap_inner.insert("three".to_string(), 1 as u32); /// hashmap_inner.insert("five".to_string(), 1 as u32); /// let mut expected_map = HashMap::new(); /// /// expected_map.insert("two".to_string(), hashmap_inner); /// assert!(keys_match(&words_near_map, &expected_map)); /// ``` pub fn words_near( word: &(String, u32), index_rang: usize, content_vec: &Vec<String>, words_sorted: &Vec<(String, 
u32)>, ) -> std::io::Result<HashMap<String, HashMap<String, u32>>> { let index: Vec<usize> = positions(&content_vec, &words_sorted[index_rang].0); let mut vec_word = Vec::new(); let max_len = content_vec.len(); for index_single in index { for i in 0..max_len { let min: usize = get_index_min(&index_single)?; let max: usize = get_index_max(&index_single, &max_len)?; if i >= min && i <= max && i != index_single { vec_word.push(content_vec[i].clone()); } else { }; } } let words_near: Vec<(String, u32)> = sort_map_to_vec(count_words(&vec_word)?)?; let mut words_near_map: HashMap<String, HashMap<String, u32>> = HashMap::new(); words_near_map.insert(word.0.to_owned(), words_near.into_par_iter().collect()); //println!("insert word: {:?}, map: {:?}", word, words_near); Ok(words_near_map) } ///Search for all position (usize) of word in given Vector<String> /// # Example /// ``` /// #[test] /// fn test() { /// use std::{collections::HashMap, hash::Hash}; /// use text_analysis::{positions}; /// let words = vec![ /// "one".to_string(), /// "two".to_string(), /// "three".to_string(), /// "four".to_string(), /// "four".to_string(), /// "five".to_string(), /// ]; /// let word = "four".to_string(); /// let position = positions(&words, &word); /// let expected = vec![3,4]; /// assert_eq!(position, expected); /// } /// ``` fn positions(vector: &Vec<String>, target: &String) -> Vec<usize> { let mut res = Vec::new(); for (index, c) in vector.into_iter().enumerate() { if &c == &target { res.push(index) } } res } ///Count words included in given &Vec<String>. Returns result as HashMap with <Word as String, Count as u32>. Returns result. 
/// # Example
/// ```
/// use text_analysis::count_words;
/// use std::collections::HashMap;
/// let words = vec!["one".to_string(), "two".to_string(), "two".to_string(), "three".to_string(), "three".to_string(), "three".to_string()];
/// let counted = count_words(&words).unwrap();
/// let mut words_map = HashMap::new();
/// words_map.insert("one".to_string(), 1 as u32);
/// words_map.insert("two".to_string(), 2 as u32);
/// words_map.insert("three".to_string(), 3 as u32);
/// assert_eq!(counted, words_map);
/// ```
pub fn count_words(words: &Vec<String>) -> std::io::Result<HashMap<String, u32>> {
    let mut frequency: HashMap<String, u32> = HashMap::new();
    for word in words {
        // Ignore words consisting of only one character. Count chars, not
        // bytes: with the original `word.len() > 1`, a single non-ASCII
        // character (e.g. "ä", two bytes in UTF-8) was wrongly counted.
        if word.chars().count() > 1 {
            *frequency.entry(word.to_owned()).or_insert(0) += 1;
        }
    }
    Ok(frequency)
}

///Sort words in HashMap<Word, Frequency> according to frequency into Vector. Returns result.
///Ties in frequency are broken alphabetically so the output order is deterministic.
/// # Example
/// ```
/// use text_analysis::sort_map_to_vec;
/// use std::collections::HashMap;
/// let mut words_map = HashMap::new();
/// words_map.insert("one".to_string(), 1 as u32);
/// words_map.insert("two".to_string(), 2 as u32);
/// words_map.insert("three".to_string(), 3 as u32);
/// let vec_sorted = sort_map_to_vec(words_map).unwrap();
/// let expected = vec![("three".to_string(), 3 as u32), ("two".to_string(), 2 as u32), ("one".to_string(), 1 as u32)];
/// assert_eq!(vec_sorted, expected);
/// ```
pub fn sort_map_to_vec(frequency: HashMap<String, u32>) -> std::io::Result<Vec<(String, u32)>> {
    let mut vec_sorted: Vec<(String, u32)> = frequency.into_iter().collect();
    // Descending by count, then ascending by word. HashMap iteration order is
    // arbitrary, so without the tie-break equal-count words came out in a
    // nondeterministic order. `sort_unstable` is fine: the comparator is a
    // total order, and it avoids the allocation a stable sort needs. The
    // original parallel sort bought nothing for typical word-list sizes.
    vec_sorted.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    Ok(vec_sorted)
}
/// # Example
/// ```
/// use text_analysis::trim_to_words;
/// let words = "(_test] {test2!=".to_string();
/// let trimmed = trim_to_words(words).unwrap();
/// let expected = vec!["test".to_string(), "test2".to_string()];
/// assert_eq!(trimmed, expected);
/// ```
pub fn trim_to_words(content: String) -> std::io::Result<Vec<String>> {
    // Lowercase everything, split hyphenated words by turning '-' into a
    // space, then strip the remaining punctuation before splitting on
    // whitespace. The original strip list also contained '-', which was dead:
    // no '-' survives the first replace.
    let content: Vec<String> = content
        .to_lowercase()
        .replace('-', " ")
        .replace(
            &[
                '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '_', '/', '\'',
                '’', '?', '!', '“', '‘',
            ][..],
            "",
        )
        .split_whitespace()
        .map(String::from)
        .collect();
    Ok(content)
}

///Get minimum index (5 positions before `index`, clamped at 0).
/// # Example
/// ```
/// #[test]
/// fn test() {
///     use text_analysis::get_index_min;
///     assert_eq!(get_index_min(&5).unwrap(), 0);
/// }
/// ```
fn get_index_min(index: &usize) -> std::io::Result<usize> {
    // Replaces the original five-branch if/else ladder; saturating_sub clamps
    // at zero for inputs 0..=4 exactly like the ladder did.
    Ok(index.saturating_sub(5))
}

///Get maximum index (5 positions after `index`, clamped to `max_len`).
/// # Example
/// ```
/// #[test]
/// fn test() {
///     use text_analysis::get_index_max;
///     assert_eq!(get_index_max(&5, &9).unwrap(), 9);
/// }
/// ```
fn get_index_max(index: &usize, max_len: &usize) -> std::io::Result<usize> {
    // saturating_add also guards the (theoretical) overflow of `index + 5`.
    Ok(index.saturating_add(5).min(*max_len))
}
///Save file to path. Returns result.
pub fn save_file(to_file: String, mut path: PathBuf) -> std::io::Result<()> {
    // Timestamped filename keeps repeated runs from colliding.
    let local: DateTime<Local> = Local::now();
    let new_filename: String = local
        .format("%Y_%m_%d_%H_%M_%S_results_word_analysis.txt")
        .to_string();
    path.push(new_filename);
    // truncate(true): without it, overwriting an existing, longer file would
    // leave stale bytes after the new content.
    let mut file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(path)?;
    file.write_all(to_file.as_bytes())?;
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_count() {
        let words = vec![
            "one".to_string(),
            "two".to_string(),
            "two".to_string(),
            "three".to_string(),
            "three".to_string(),
            "three".to_string(),
        ];
        let counted = count_words(&words).unwrap();
        let mut words_map = HashMap::new();
        words_map.insert("one".to_string(), 1 as u32);
        words_map.insert("two".to_string(), 2 as u32);
        words_map.insert("three".to_string(), 3 as u32);
        assert_eq!(counted, words_map);
    }

    #[test]
    fn test_max_min_index() {
        let index1 = 5;
        let min_index1 = get_index_min(&index1).unwrap();
        let max_index1 = get_index_max(&index1, &9).unwrap();
        assert_eq!(min_index1, 0);
        assert_eq!(max_index1, 9);
        let index2 = 0;
        let min_index2 = get_index_min(&index2).unwrap();
        let max_index2 = get_index_max(&index2, &5).unwrap();
        assert_eq!(min_index2, 0);
        assert_eq!(max_index2, 5);
        let index3 = 100;
        let min_index3 = get_index_min(&index3).unwrap();
        let max_index3 = get_index_max(&index3, &103).unwrap();
        assert_eq!(min_index3, 95);
        assert_eq!(max_index3, 103);
    }

    #[test]
    fn test_words_near() {
        use std::{collections::HashMap, hash::Hash};
        // Compare two maps by keys only; the inner counts are not asserted.
        fn keys_match<T: Eq + Hash, U, V>(map1: &HashMap<T, U>, map2: &HashMap<T, V>) -> bool {
            map1.len() == map2.len() && map1.keys().all(|k| map2.contains_key(k))
        }
        let words = vec![
            "one".to_string(),
            "two".to_string(),
            "three".to_string(),
            "four".to_string(),
            "four".to_string(),
            "five".to_string(),
        ];
        let word = ("two".to_string(), 2 as u32);
        let words_sorted: Vec<(String, u32)> =
            sort_map_to_vec(count_words(&words).unwrap()).unwrap();
        let index_rang: usize = 1;
        let words_near_map = words_near(&word, index_rang, &words, &words_sorted).unwrap();
        let mut hashmap_inner = HashMap::new();
        hashmap_inner.insert("four".to_string(), 2 as u32);
        hashmap_inner.insert("one".to_string(), 1 as u32);
        hashmap_inner.insert("three".to_string(), 1 as u32);
        hashmap_inner.insert("five".to_string(), 1 as u32);
        let mut expected_map = HashMap::new();
        expected_map.insert("two".to_string(), hashmap_inner);
        assert!(keys_match(&words_near_map, &expected_map));
    }

    #[test]
    fn example_test() {
        let content_string: String =
            "An example phrase including two times the word two".to_string();
        let content_vec: Vec<String> = trim_to_words(content_string).unwrap();
        let word_frequency = count_words(&content_vec).unwrap();
        let words_sorted = sort_map_to_vec(word_frequency).unwrap();
        let mut index_rang: usize = 0;
        let mut words_near_map: HashMap<String, HashMap<String, u32>> = HashMap::new();
        for word in &words_sorted {
            words_near_map
                .extend(words_near(&word, index_rang, &content_vec, &words_sorted).unwrap());
            index_rang += 1;
        }
        let mut result_as_string = String::new();
        for word in words_sorted {
            let (word_only, frequency) = &word;
            let words_near = &words_near_map[word_only];
            let combined = format!(
                "Word: {:?}, Frequency: {:?},\nWords near: {:?} \n\n",
                word_only,
                frequency,
                sort_map_to_vec(words_near.to_owned()).unwrap()
            );
            result_as_string.push_str(&combined);
        }
        // The original test built the report but asserted nothing.
        assert!(!result_as_string.is_empty());
    }
}