markov_namegen/characterchain/generator.rs
1use crate::characterchain::builder::CharacterChainGeneratorBuilder;
2use crate::interface::RandomTextGenerator;
3use log::{debug, trace};
4use multimarkov::MultiMarkov;
5use regex::Regex;
6
/// A random text generator backed by a character-level Markov chain.
///
/// This struct, once trained on a corpus of training data, can be used repeatedly to generate
/// random text strings (i.e. names) that sort-of resemble the training data.  At its heart is a
/// Markov chain model that keeps track of the relative probabilities with which
/// letters of the alphabet follow other letters in the training data set.
///
/// Create an instance using the builder pattern:
/// ```
/// use markov_namegen::CharacterChainGenerator;
/// let dwarf_names = vec!["dopey","sneezy","bashful","sleepy","happy","grumpy","doc"].into_iter();
/// let namegen = CharacterChainGenerator::builder().train(dwarf_names).build();
/// ```
///
/// Training data can be an iterator of `String` or of `&str` type, and you can call `.train()`
/// repeatedly, for cumulative training on more than one dataset.
///
/// Here's an example with all the optional settings:
///
/// ```
/// use markov_namegen::CharacterChainGenerator;
/// use rand::{rngs::SmallRng, SeedableRng};
/// let pokedex_names = vec!["bulbasaur","charmander","squirtle","pikachu"].into_iter();
/// let namegen = CharacterChainGenerator::builder()
///     .with_order(2)
///     .with_prior(0.007)
///     .with_pattern("^[A-Za-z]{4,8}$")
///     .preserving_case()
///     .with_rng(Box::new(SmallRng::seed_from_u64(123)))
///     .train(pokedex_names)
///     .build();
/// ```
///
/// You can set a pattern to filter acceptable names; for example above we are requiring that
/// results must be 4 to 8 characters long.  CharacterChainGenerator will simply re-roll new names
/// until it finds one that matches.  Be careful: if you supply a difficult-to-match pattern,
/// name generation may be very slow; if you supply an impossible-to-match pattern, for example
/// one that requires characters not seen in the training data, you will get an infinite loop.
///
/// Here's a final example that reads names from a file (one name per line), builds up a
/// CharacterChainGenerator, and then spits out a few names:
///
/// ```
/// use std::fs::File;
/// use std::io::{BufReader, BufRead};
/// use markov_namegen::CharacterChainGenerator;
/// use markov_namegen::RandomTextGenerator;
///
/// let file = File::open("resources/romans.txt").unwrap();
/// let reader = BufReader::new(file);
/// let lines = reader.lines().map(|l| l.unwrap() );
///
/// let mut namegen = CharacterChainGenerator::builder()
///     .train(lines)
///     .build();
///
/// for _i in 0..10 {
///     println!("{}", namegen.generate_one());
/// }
/// ```
///
/// # Panics
///
/// Generation unwraps the model's next-character result, so it panics if the model
/// cannot supply a next character for the sequence built so far.
#[derive(Debug)]
pub struct CharacterChainGenerator {
    /// Markov chain model over characters; generation repeatedly asks it for the next
    /// character given the sequence produced so far, using `#` as the word boundary marker.
    pub(super) model: MultiMarkov<char>,
    /// Optional regular expression that every generated string must match before it is
    /// returned; `None` means the first generated string is accepted as-is.
    pub(super) pattern: Option<Regex>,
}
71
72impl<'a> CharacterChainGenerator {
73 pub const DEFAULT_ORDER: i32 = 3;
74 pub const DEFAULT_PRIOR: f64 = 0.005;
75
76 pub fn builder() -> CharacterChainGeneratorBuilder<'a> {
77 CharacterChainGeneratorBuilder::new()
78 }
79
80 fn generate_string(&mut self) -> String {
81 // start with the beginning-of-word character
82 let mut name = vec!['#'];
83 loop {
84 // keep adding letters until we reach the end-of-word character
85 name.push(self.model.random_next(&name).unwrap());
86 if name.ends_with(&['#']) {
87 break
88 }
89 }
90 // remove the trailing and leading "#" signs
91 name.pop();
92 name.remove(0);
93 name.iter().collect::<String>()
94 }
95}
96
97impl RandomTextGenerator for CharacterChainGenerator {
98 fn generate_one(&mut self) -> String {
99 match self.pattern.clone() {
100 None => self.generate_string(),
101 Some(re) => {
102 let mut candidate = self.generate_string();
103 while !re.is_match(&candidate) {
104 debug!("CharacterChainGenerator generated '{}' which doesn't match the regex pattern. Re-rolling!", candidate);
105 candidate = self.generate_string();
106 }
107 trace!("CharacterChainGenerator generated '{}'",candidate);
108 candidate
109 }
110 }
111 }
112}