markov_strings/
lib.rs

1#![warn(missing_docs)]
2
3//! A simplistic & configurable Markov chain text generator
4//!
5//! Give it a vec of strings and generate random results.
6//! Works best with tweets, chat history, news headlines...
7//!
8//! Minimal example:
9//! ```no_run
10//! use markov_strings::*;
11//!
12//! let data: Vec<InputData> = vec![/* a lot of data */];
13//! let mut markov = Markov::new();
14//! markov.add_to_corpus(data);
15//! let result: MarkovResult = markov.generate().unwrap();
16//! ```
17
18use rand::prelude::*;
19use serde::{Deserialize, Serialize};
20use std::collections::{HashMap, HashSet};
21
22/// The input struct to build the markov-strings corpus.
23///
24/// ```rust
25/// # use markov_strings::*;
26/// let data = vec![InputData {
27///     text: "foo bar".to_string(),
28///     meta: Some("serialized value".to_string())
29/// }];
30/// ```
31///
32/// Implements `impl From<String>` so you can do
33/// ```rust
34/// # use markov_strings::*;
35/// let data: Vec<InputData> = vec!["foo bar".to_string()]
36///     .iter()
37///     .map(|s| s.to_owned().into())
38///     .collect();
39/// ```
40#[derive(Eq, PartialEq, Clone, Debug, Serialize, Deserialize)]
41pub struct InputData {
42    /// The required value from which the generator will build new strings
43    pub text: String,
44    /// An optional field can contain any serialized data that you may wish to retrieve later from the [`Result.refs`](struct.MarkovResult.html#structfield.refs) set
45    pub meta: Option<String>,
46}
47
48impl From<String> for InputData {
49    fn from(text: String) -> Self {
50        InputData { text, meta: None }
51    }
52}
53
54#[derive(Serialize, Deserialize, Debug)]
55struct DataMember {
56    state_size: usize,
57}
58
59/// Struct holding the generator's results.
60#[derive(Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
61pub struct MarkovResult {
62    /// The generated text
63    pub text: String,
64    /// A relative value based on the possible number of permutations that led to this result.
65    /// Higher is usually "better", but the threshold depends on your corpus.
66    pub score: u16,
67    /// The list of references ids that were used to create this result. To retrieve the original values
68    pub refs: Vec<usize>,
69    /// The number of tries it took to generate the result
70    pub tries: u16,
71}
72
73type Fragments = HashMap<String, Vec<usize>>;
74
75/// Struct used to import and export data.
76///
77/// See [`Markov::export()`] for more information.
78#[derive(Serialize, Deserialize, Debug)]
79pub struct ImportExport {
80    data: Vec<InputData>,
81    corpus: Corpus,
82    start_words: Fragments,
83    end_words: Fragments,
84    options: DataMember,
85}
86
87/// Struct for possible errors during the corpus building, or result generation.
88#[derive(Debug, Eq, PartialEq)]
89pub enum ErrorType {
90    /// Returned by [`Markov::generate()`] if your corpus is empty
91    CorpusEmpty,
92    /// Returned by [´Markov::set_state_size()`] if your corpus already contain data.
93    CorpusNotEmpty,
94    /// Returned by [`Markov::generate()`] if it exceeds the maximum allowed tries to generate a result.
95    TriesExceeded,
96}
97
98type MarkovResultFilter = fn(&MarkovResult) -> bool;
99type Corpus = HashMap<String, Fragments>;
100
101/// The Markov chain generator
102///
103/// 1. Initialize it empty or from saved corpus
104/// 2. Add data to complete the corpus
105/// 3. Generate results
106#[derive(Serialize, Deserialize)]
107pub struct Markov {
108    /// Raw data (a list of strings)
109    data: Vec<InputData>,
110    options: DataMember,
111    start_words: Fragments,
112    end_words: Fragments,
113    corpus: HashMap<String, Fragments>,
114    #[serde(skip)]
115    filter: Option<fn(&MarkovResult) -> bool>,
116    #[serde(skip)]
117    max_tries: u16,
118}
119
120impl Markov {
121    /// Creates an empty Markov instance
122    ///
123    /// ```rust
124    /// use markov_strings::*;
125    ///
126    /// let mut markov = Markov::new();
127    /// ```
128    pub fn new() -> Markov {
129        let opts = DataMember { state_size: 2 };
130        Markov {
131            data: vec![],
132            options: opts,
133            start_words: HashMap::new(),
134            end_words: HashMap::new(),
135            corpus: HashMap::new(),
136            filter: None,
137            max_tries: 100,
138        }
139    }
140
141    /// Creates a Markov instance from previously imported data
142    ///
143    /// See [`Markov::export()`] for more information.
144    ///
145    /// Example: load your saved corpus from a flat file with the `bincode` crate.
146    /// ```ignore
147    /// let file = File::open("dumped.db").unwrap();
148    /// let reader = BufReader::new(file);
149    /// let data = bincode::deserialize_from(reader).unwrap();
150    /// let mut markov = Markov::from_export(data);
151    /// ```
152    pub fn from_export(export: ImportExport) -> Markov {
153        Markov {
154            data: export.data,
155            options: export.options,
156            corpus: export.corpus,
157            filter: None,
158            start_words: export.start_words,
159            end_words: export.end_words,
160            max_tries: 10,
161        }
162    }
163
164    /// Sets the "state size" of your Markov generator.
165    ///
166    /// The result chain is made up of consecutive blocks of words, and each block is called a state.
167    /// Each state is itself made up of one (1) or more words.
168    ///
169    /// ```rust
170    /// # use markov_strings::*;
171    /// let data: Vec<InputData> = vec![];
172    /// # let data: Vec<InputData> = vec![
173    /// #   InputData{ text: "foo bar lorem ipsum".to_string(), meta: None },
174    /// # ];
175    /// let mut markov = Markov::new();
176    ///
177    /// // We _must_ set the state_size before adding data...
178    /// assert!(markov.set_state_size(3).is_ok());
179    ///
180    /// // ...or it will return an error
181    /// markov.add_to_corpus(data);
182    /// assert!(markov.set_state_size(4).is_err());
183    /// ```
184    ///
185    /// - A state size of `1` word will mostly output non-sense gibberish.
186    /// - A state size of `2` words can produce interesting results, when correctly filtered.
187    /// - A state size of `3` or more words will produce more intelligible results,
188    /// but you'll need a source material that will allow it while staying random enough.
189    ///
190    /// **! You CANNOT change the state_size once you've added data with [`Markov::add_to_corpus()`]**.<br>
191    /// The internal data structure is reliant on the state size, and it cannot be changed without
192    /// rebuilding the whole corpus.
193    ///
194    /// Default value `2`.
195    pub fn set_state_size(&mut self, size: usize) -> Result<&mut Self, ErrorType> {
196        if self.start_words.len() > 0 {
197            return Err(ErrorType::CorpusNotEmpty);
198        }
199        self.options.state_size = size;
200        Ok(self)
201    }
202
203    /// Adds data to your Markov instance's corpus.
204    ///
205    /// This is an expensive method that can take a few seconds,
206    /// depending on the size of your input data.
207    /// For example, adding 50.000 tweets while running on fairly decent computer takes more than 20 seconds.
208    ///
209    /// To avoid rebuilding the corpus each time you want to generate a text,
210    /// you can use [`Markov::export()`] and [`Markov::from_export()`]
211    ///
212    /// You can call `.add_to_corpus()` as many times as you need it.
213    pub fn add_to_corpus(&mut self, data: Vec<InputData>) {
214        data.iter().for_each(|o| self.data.push(o.to_owned()));
215        let state_size = self.options.state_size;
216        // let data_len = data.len();
217        // let mut data_done = 0;
218
219        // Loop through all sentences
220        for item in data.iter() {
221            // Get position of current item in self.data
222            let pos = self.data.iter().position(|o| o == item).unwrap();
223
224            // data_done += 1;
225            // println!("{:?}/{:?}", data_done, data_len);
226
227            let words = item.text.split(' ').collect::<Vec<&str>>();
228
229            let count = words.len();
230            if count < self.options.state_size {
231                continue;
232            }
233
234            // "Start words" is the list of words that can start a generated chain.
235            let start = (&words)
236                .iter()
237                .take(state_size)
238                .map(|s| s.to_owned())
239                .collect::<Vec<_>>()
240                .join(" ");
241            self.start_words.entry(start).or_insert(vec![]).push(pos);
242
243            // "End words" is the list of words that can end a generated chain
244            let end = (&words)
245                .iter()
246                .skip(count - state_size)
247                .take(state_size)
248                .map(|s| s.to_owned())
249                .collect::<Vec<&str>>()
250                .join(" ");
251            self.end_words.entry(end).or_insert(vec![]).push(pos);
252
253            // Corpus generation
254
255            // We loop through all words in the sentence to build "blocks" of `state_size`
256            // e.g. for a state_size of 2, "lorem ipsum dolor sit amet" will have the following blocks:
257            //    "lorem ipsum", "ipsum dolor", "dolor sit", and "sit amet"
258            for (i, _) in words.clone().iter().enumerate() {
259                let curr = (&words)
260                    .iter()
261                    .skip(i)
262                    .take(state_size)
263                    .map(|s| s.to_owned())
264                    .collect::<Vec<&str>>()
265                    .join(" ");
266
267                let next = (&words)
268                    .iter()
269                    .skip(i + state_size)
270                    .take(state_size)
271                    .map(|s| s.to_owned())
272                    .collect::<Vec<&str>>()
273                    .join(" ");
274
275                // Filter out fragments that are empty or too short
276                if next.len() == 0 || next.split(' ').count() < state_size {
277                    continue;
278                }
279
280                self.corpus
281                    // Get or create the "curr" block
282                    .entry(curr)
283                    .or_insert(HashMap::new())
284                    // Insert the "next" value
285                    .entry(next)
286                    .or_insert(vec![pos])
287                    .push(pos);
288            }
289        }
290    }
291
292    /// Sets a filter to ensure that outputted results match your own criteria.
293    ///
294    /// A good filter is **essential** to get interesting results out of [`Markov::generate()`].
295    /// The values you should check at minimum are the [`MarkovResult.score`](struct.MarkovResult.html#structfield.score) and [`MarkovResult.refs`](struct.MarkovResult.html#structfield.refs)' length.
296    ///
297    /// The higher these values, the "better" the results. The actual thresholds are entierely dependant
298    /// of your source material.
299    ///
300    /// ```rust
301    /// # use markov_strings::*;
302    /// let mut markov = Markov::new();
303    /// // We're going to generate tweets, so...
304    /// markov
305    ///     .set_filter(|r| {
306    ///         // Minimum score and number of references
307    ///         // to ensure good randomness
308    ///         r.score > 50 && r.refs.len() > 10
309    ///             // Max length of a tweet
310    ///             && r.text.len() <= 280
311    ///             // No mentions
312    ///             && !r.text.contains("@")
313    ///             // No urls
314    ///             && !r.text.contains("http")
315    ///   });
316    /// ```
317    pub fn set_filter(&mut self, f: MarkovResultFilter) -> &mut Self {
318        self.filter = Some(f);
319        self
320    }
321
322    /// Removes the filter, if any
323    ///
324    /// ```rust
325    /// # use markov_strings::*;
326    /// let mut markov = Markov::new();
327    /// // Those two lines a functionally identical.
328    /// markov.set_filter(|r| true);
329    /// markov.unset_filter();
330    /// ```
331    pub fn unset_filter(&mut self) -> &mut Self {
332        self.filter = None;
333        self
334    }
335
336    /// Sets the maximum number of times the generator will try to generate a result.
337    ///
338    /// If [`Markov::generate`] fails [max_tries] times to generate a sentence,
339    /// it returns an [`ErrorType.TriesExceeded`](enum.ErrorType.html#variant.TriesExceeded).
340    ///
341    /// Default value: `100`
342    pub fn set_max_tries(&mut self, tries: u16) -> &mut Self {
343        self.max_tries = tries;
344        self
345    }
346
347    /// Generates a random result from your corpus.
348    ///
349    /// ```rust
350    /// # use markov_strings::*;
351    /// let mut markov = Markov::new();
352    /// let data: Vec<InputData> = vec![/* lots of data */];
353    /// # let data: Vec<InputData> = vec![
354    /// #   InputData{ text: "foo bar lorem ipsum".to_string(), meta: None },
355    /// # ];
356    /// markov.add_to_corpus(data);
357    /// let result = markov.generate().unwrap();
358    /// ```
359    pub fn generate(&self) -> Result<MarkovResult, ErrorType> {
360        if self.corpus.len() == 0 {
361            return Err(ErrorType::CorpusEmpty);
362        }
363        let max_tries = self.max_tries;
364        let mut tries: u16 = 0;
365        let mut rng = thread_rng();
366
367        // Loop through fragments to create a complete sentence
368        for _ in 0..max_tries {
369            tries += 1;
370            let mut ended = false;
371            let mut references: HashSet<usize> = HashSet::new();
372
373            // Create an array of MarkovCorpusItems
374            // The first item is a random startWords element
375            let mut arr = vec![self.start_words.iter().choose(&mut rng).unwrap()];
376            let mut score: u16 = 0;
377
378            // Loop to build a complete sentence
379            for _ in 0..max_tries {
380                // Last value in array
381                let block = arr[arr.len() - 1];
382
383                // Find a following item in the corpus
384                let fragments = match self.corpus.get(block.0) {
385                    Some(v) => v,
386                    // If a state cannot be found, the sentence can't be completed
387                    None => break,
388                };
389                let state = fragments.iter().choose(&mut rng).unwrap();
390
391                // Add new state to list
392                arr.push(state);
393                // Save references
394                state.1.iter().for_each(|o| {
395                    references.insert(*o);
396                });
397
398                // Increment score
399                score += (self.corpus.get(block.0).unwrap().len() - 1) as u16;
400
401                // Is sentence finished?
402                if self.end_words.get(state.0).is_some() {
403                    ended = true;
404                    break;
405                }
406            }
407            let sentence = arr
408                .iter()
409                .map(|o| o.0.to_owned())
410                .collect::<Vec<_>>()
411                .join(" ")
412                .trim()
413                .to_string();
414
415            let result = MarkovResult {
416                text: sentence,
417                score,
418                refs: references.into_iter().collect::<Vec<_>>(),
419                tries,
420            };
421
422            // Sentence is not ended or incorrect
423            // let filter = options.filter_result.unwrap();
424            if !ended || (self.filter.is_some() && !self.filter.unwrap()(&result)) {
425                continue;
426            }
427
428            return Ok(result);
429        }
430
431        Err(ErrorType::TriesExceeded)
432    }
433
434    /// Gets an item from the original data.
435    ///
436    /// Use this with the indices from [`MarkovResult.refs`](struct.MarkovResult.html#structfield.refs)
437    ///
438    /// ```rust
439    /// # use markov_strings::*;
440    /// # use std::collections::{HashMap, HashSet};
441    /// let data: Vec<InputData> = vec![
442    ///   InputData{ text: "foo bar lorem ipsum".to_string(), meta: Some("something".to_string()) },
443    /// ];
444    /// let mut markov = Markov::new();
445    /// markov.add_to_corpus(data);
446    /// let result = markov.generate().unwrap();
447    ///
448    /// // Since we only have 1 string in our corpus, we have 1 ref...
449    /// let mut expected: Vec<usize> = vec![];
450    /// expected.push(0);
451    /// assert_eq!(result.refs, expected);
452    /// let input_ref = *result.refs.get(0).unwrap();
453    /// assert_eq!(markov.get_input_ref(input_ref).unwrap().text, "foo bar lorem ipsum");
454    /// assert_eq!(markov.get_input_ref(input_ref).unwrap().meta, Some("something".to_string()));
455    /// ```
456    pub fn get_input_ref(self: &Self, index: usize) -> Option<&InputData> {
457        self.data.get(index)
458    }
459
460    /// Exports the corpus into a serializable structure.
461    ///
462    /// The [`Markov::add_to_corpus()`] method being expensive, you may want to build your corpus once,
463    /// then export it to a serializable file file for later use.
464    ///
465    /// ```rust
466    /// # use markov_strings::*;
467    /// let data: Vec<InputData> = vec![];
468    /// let mut markov = Markov::new();
469    /// markov.add_to_corpus(data);
470    /// let export = markov.export();
471    ///
472    /// let markov = Markov::from_export(export);
473    /// let result = markov.generate();
474    /// ```
475    pub fn export(self) -> ImportExport {
476        return ImportExport {
477            data: self.data,
478            options: self.options,
479            corpus: self.corpus,
480            start_words: self.start_words,
481            end_words: self.end_words,
482        };
483    }
484}
485
486#[cfg(test)]
487mod tests {
488    use super::*;
489
490    fn get_example_data() -> Vec<InputData> {
491        let data: Vec<&str> = vec![
492            "Lorem ipsum dolor sit amet",
493            "Lorem ipsum duplicate start words",
494            "Consectetur adipiscing elit",
495            "Quisque tempor, erat vel lacinia imperdiet",
496            "Justo nisi fringilla dui",
497            "Egestas bibendum eros nisi ut lacus",
498            "fringilla dui avait annoncé une rupture avec le erat vel: il n'en est rien…",
499            "Fusce tincidunt tempor, erat vel lacinia vel ex pharetra pretium lacinia imperdiet",
500        ];
501        data.iter()
502            .map(|s| InputData {
503                text: s.to_string(),
504                meta: None,
505            })
506            .collect()
507    }
508
509    #[test]
510    fn constructor_has_default_state_size() {
511        let markov = Markov::new();
512        assert!(markov.options.state_size == 2)
513    }
514
515    #[test]
516    fn set_state_size_works() {
517        let mut markov = Markov::new();
518        markov.set_state_size(3).unwrap();
519        assert_eq!(markov.options.state_size, 3)
520    }
521
522    #[test]
523    fn add_to_corpus_works() {
524        let mut markov = Markov::new();
525        assert_eq!(markov.corpus.len(), 0);
526        markov.add_to_corpus(get_example_data());
527        assert_eq!(markov.corpus.len(), 28)
528    }
529
530    #[test]
531    fn start_words_should_have_the_right_length() {
532        let mut markov = Markov::new();
533        markov.add_to_corpus(get_example_data());
534        assert_eq!(markov.start_words.len(), 7 as usize);
535    }
536
537    #[test]
538    fn start_words_should_contain_the_right_values() {
539        let mut markov = Markov::new();
540        markov.add_to_corpus(get_example_data());
541        let fragments = &markov.start_words;
542        assert!(fragments.iter().any(|o| o.0 == "Lorem ipsum"));
543        assert!(fragments.iter().any(|o| o.0 == "Consectetur adipiscing"));
544        assert!(fragments.iter().any(|o| o.0 == "Quisque tempor,"));
545        assert!(fragments.iter().any(|o| o.0 == "Justo nisi"));
546        assert!(fragments.iter().any(|o| o.0 == "Egestas bibendum"));
547        assert!(fragments.iter().any(|o| o.0 == "fringilla dui"));
548        assert!(fragments.iter().any(|o| o.0 == "Fusce tincidunt"));
549    }
550
551    #[test]
552    fn end_words_should_have_the_right_length() {
553        let mut markov = Markov::new();
554        markov.add_to_corpus(get_example_data());
555        assert_eq!(markov.end_words.len(), 7 as usize);
556    }
557
558    #[test]
559    fn end_words_should_contain_the_right_values() {
560        let mut markov = Markov::new();
561        markov.add_to_corpus(get_example_data());
562        let fragments = &markov.end_words;
563        assert!(fragments.iter().any(|o| o.0 == "sit amet"));
564        assert!(fragments.iter().any(|o| o.0 == "start words"));
565        assert!(fragments.iter().any(|o| o.0 == "adipiscing elit"));
566        assert!(fragments.iter().any(|o| o.0 == "fringilla dui"));
567        assert!(fragments.iter().any(|o| o.0 == "ut lacus"));
568        assert!(fragments.iter().any(|o| o.0 == "est rien…"));
569    }
570
571    #[test]
572    fn corpus_should_have_the_right_values_for_the_right_keys() {
573        let mut markov = Markov::new();
574        markov.add_to_corpus(get_example_data());
575        let fragments = &markov.corpus.get("Lorem ipsum").unwrap();
576        assert!(fragments.iter().any(|f| f.0 == "dolor sit"));
577        assert!(fragments.iter().any(|f| f.0 == "duplicate start"));
578        let fragments = &markov.corpus.get("tempor, erat").unwrap();
579        assert!(fragments.iter().any(|f| f.0 == "vel lacinia"));
580    }
581
582    #[test]
583    fn generator_should_return_err_if_the_corpus_is_not_build() {
584        let markov = Markov::new();
585        let res = markov.generate();
586        assert_eq!(res.unwrap_err(), ErrorType::CorpusEmpty);
587    }
588
589    #[test]
590    fn generator_should_return_a_result_under_the_tries_limit() {
591        let mut markov = Markov::new();
592        markov.add_to_corpus(get_example_data());
593        for _ in 0..10 {
594            let sentence = markov.generate();
595            assert!(sentence.unwrap().tries < 20);
596        }
597    }
598
599    #[test]
600    fn generator_should_return_error() {
601        // Arrange
602        let mut markov = Markov::new();
603        markov.add_to_corpus(get_example_data());
604
605        // Act
606        let result = markov.set_filter(|_| false).generate();
607
608        // Assert
609        assert_eq!(result.unwrap_err(), ErrorType::TriesExceeded);
610    }
611
612    #[test]
613    fn result_should_end_with_an_endwords_item() {
614        // Arrange
615        let mut markov = Markov::new();
616        markov.add_to_corpus(get_example_data());
617
618        for _ in 0..10 {
619            // Act
620            let result = markov.generate().unwrap();
621            let arr = result.text.split(' ').collect::<Vec<_>>();
622            let len = arr.len();
623            let end = arr
624                .into_iter()
625                .skip(len - 2)
626                .take(2)
627                .collect::<Vec<_>>()
628                .join(" ");
629            // Assert
630            assert!(markov.end_words.iter().any(|f| f.0 == &end));
631        }
632    }
633
634    #[test]
635    fn input_data_from_string() {
636        let text = "foo";
637        let input = InputData::from(text.to_owned());
638        assert_eq!(input.text, "foo");
639        assert_eq!(input.meta, None);
640
641        let texts = vec!["foo".to_string()];
642        let mut markov = Markov::new();
643        markov.add_to_corpus(
644            texts
645                .iter()
646                .map(|t| t.to_owned().into())
647                .collect::<Vec<_>>(),
648        );
649    }
650}