1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
//! Fast and memory efficient Markov Chain implementation, optimized for text generation
//!
//! Features
//! --------
//!
//! - User can specify what regex they want to use for tokenization.
//! - MarkovChain::generate and MarkovChain::generate_start functions both take RngCore instead of using rand::thread_rng().
//! - Strings are interned for faster training and less memory usage.
//! - Serialization and deserialization with [serde](https://docs.rs/serde/latest/serde/), when `serialize` feature flag is used.
//!
//! Example
//! -------
//!
//! ```rust
//! use markov_str::*;
//! use rand::SeedableRng;
//! use regex::Regex;
//! use std::fs::{self, read_to_string};
//!
//! let training_path = "data";
//!
//! // Gets the paths of evey file and directory in the training_path.
//! let tpaths = fs::read_dir(training_path)
//! .unwrap_or_else(|_| panic!("Can't read files from: {}", training_path));
//!
//! // Only the files remain
//! let files = tpaths
//! .filter_map(|f| f.ok())
//! .filter(|f| match f.file_type() {
//! Err(_) => false,
//! Ok(f) => f.is_file(),
//! });
//!
//! // Reads every file into a string
//! let contents = files.filter_map(|f| read_to_string(f.path()).ok());
//!
//! // Creating the Markov Chain
//! let markov_chain = contents.fold(
//! MarkovChain::with_capacity(2, 8_000_000, Regex::new(WORD_REGEX).unwrap()),
//! |mut a, s| {
//! a.add_text(&s);
//! a
//! },
//! );
//!
//! // Number of tokens
//! println!("{}", markov_chain.len());
//!
//! // Generation
//! for _ in 0..10 {
//! println!("{}", markov_chain.generate_start("among the ", 25, &mut rand::thread_rng()).unwrap());
//! }
//! // Generation
//! println!("{}", markov_chain.len());
//!
//! // ThreadRng
//! for _ in 0..10 {
//! println!(
//! "ThreadRng: {}",
//! markov_chain
//! .generate_start("among the ", 25, &mut rand::thread_rng())
//! .unwrap()
//! );
//! }
//!
//! // StdRng with seed
//! let mut rng = rand::rngs::StdRng::seed_from_u64(1337);
//! for _ in 0..10 {
//! println!(
//! "Seeded: {}",
//! markov_chain
//! .generate_start("among the ", 25, &mut rng)
//! .unwrap()
//! );
//! }
//!
//! // Cloned
//! let mut rng = rand::rngs::StdRng::seed_from_u64(1337);
//! let m: MarkovChain = markov_chain.clone();
//! for _ in 0..10 {
//! println!(
//! "Cloned: {}",
//! m.generate_start("among the ", 25, &mut rng).unwrap()
//! );
//! }
//! ```
//!
//! This example is taken from the `examples/main.rs`, you can run it by:
//! ```ignore
//! ./get_data.sh
//! cargo run --release --example=main
//! ```
//!
//! `./get_data.sh` will download the first 200 books from [Project Gutenberg](https://www.gutenberg.org/), which totals up to more than 100MBs of text.
//!
//! License
//! -------
//!
//! markov_str is licensed under the MIT license. Feel free to fork and use however you like.
pub use crate*;
/// Recommended Regex for general use.
pub static WORD_REGEX: &str = r"(\p{Alphabetic}|\d)(\p{Alphabetic}|\d|'|-)*(\.|!|\?)?";