shitpost_markov 1.0.1

Backend for training a Markov bot at compile-time.
Documentation
use std::{collections::HashMap, sync::Mutex};

use regex::Regex;
use string_interner::{DefaultBackend, StringInterner};

use crate::markov::TokenType;

/// Transition table: maps each token to the tokens that were recorded directly
/// after it, together with the number of times each of those followers was seen.
pub type Tokens = HashMap<TokenType, HashMap<TokenType, usize>>;

lazy_static::lazy_static! {
	/// Global string interner, guarded by a mutex, used by [`tokenise`] to
	/// turn token text into symbols.
	pub static ref INTERN: Mutex<StringInterner<DefaultBackend>> = Mutex::new(StringInterner::default());
	/// Matches the next token of a post. Exactly one of three alternatives
	/// applies:
	///
	/// * a tag, optionally preceded by whitespace: sets `tag` and `attrs`,
	///   plus `whitespace`, `close` (for `</…>`) and `autoclose` (for `…/>`)
	///   when present. Attribute values are matched in backslash-escaped
	///   quotes (`name=\"value\"`);
	/// * an HTML entity such as `&amp;`: sets `html`;
	/// * a word or a single punctuation character other than `<`/`>`,
	///   including any whitespace in front of it: sets `word`.
	///
	/// The pattern is written in verbose (`(?x)`) mode; it is trimmed before
	/// compilation so that the flag is the very first thing in the pattern.
	static ref TOKEN: Regex = Regex::new(r#"
        (?x)                                                    # ignore whitespace in regex
        (?<whitespace> \s+)?                                    # whitespace before tag
        <(?<close>/)?                                           # throw away closing tags
          (?<tag>       [[:alnum:]_-]+)
          (?<attrs>     (?:\s[[:alnum:]_-]+(?:=\\"[^"]*\\")?)*)
          (?<autoclose> /)?>                                    # detect if autoclosing tag
        | (?<html>      &[\w\#]+;)                              # html sanitased symbols
        | (?<word>
            \s*[\w’]+                                           # match words with spaces
          | \s*[[:punct:]&&[^<>]])                              # match punctuation with spaces
        "#.trim()).unwrap();
}

/// Records one `$prev_token` -> `$token` transition in `$tokens`, then makes
/// `$token` the new `$prev_token`.
///
/// * `$tokens` – identifier of a (dereferenceable) two-level map
///   `prev -> (next -> count)`.
/// * `$prev_token` – identifier of the mutable variable holding the previous
///   token; it is moved into the map lookup and reassigned afterwards.
/// * `$token` – expression producing the token that follows `$prev_token`.
///   It is expanded twice (once as the map key, once for the assignment), so
///   it must yield the same value on each evaluation.
///
/// A follower map that does not exist yet is created with
/// `$crate::TOKEN_SUB_CAPACITY` pre-allocated slots.
macro_rules! insert {
	($tokens:ident, $prev_token:ident, $token:expr) => {
		// The entry API needs a single lookup per map level, instead of a
		// `get_mut` followed by an `insert` on a miss.
		*$tokens
			.entry($prev_token)
			.or_insert_with(|| {
				HashMap::with_capacity($crate::TOKEN_SUB_CAPACITY)
			})
			.entry($token)
			.or_insert(0) += 1;
		$prev_token = $token;
	};
}

/// Splits `post` into tokens and records every transition between
/// consecutive tokens in `tokens`.
///
/// The walk starts at [`TokenType::Start`]. For each match of the token
/// pattern, in order:
///
/// * a closing tag records [`TokenType::End`];
/// * any other tag records a [`TokenType::Tag`] (followed by an extra
///   [`TokenType::End`] when the tag closes itself);
/// * an HTML entity is decoded and recorded as [`TokenType::Text`];
/// * a word or punctuation character is recorded as [`TokenType::Text`].
///
/// Once nothing more matches, a final [`TokenType::End`] is recorded.
///
/// Both `tokens` and the global [`INTERN`] stay locked for the whole call.
///
/// # Panics
///
/// Panics if either mutex is poisoned.
// `insert!` reassigns `prev_token` on every expansion; the value written by
// the last one is never read, which is what the lint would complain about.
#[allow(unused_assignments)]
pub fn tokenise(post: &str, tokens: &Mutex<Tokens>) {
	let mut chain = tokens.lock().unwrap();
	let mut interner = INTERN.lock().unwrap();

	// Unconsumed tail of the post; shrinks after every match.
	let mut remaining = post;
	let mut prev_token = TokenType::Start;

	while let Some(found) = TOKEN.captures(remaining) {
		let consumed = found.get(0).unwrap().end();
		remaining = &remaining[consumed..];

		// Closing tag: only the end marker is recorded.
		if found.name("close").is_some() {
			insert!(chain, prev_token, TokenType::End);
			continue;
		}

		// Opening (or self-closing) tag.
		if let Some(name) = found.name("tag") {
			let attrs = if let Some(raw) = found.name("attrs") {
				interner.get_or_intern(raw.as_str())
			} else {
				interner.get_or_intern_static("")
			};
			let whitespace = found
				.name("whitespace")
				.map(|gap| interner.get_or_intern(gap.as_str()));
			let tag = interner.get_or_intern(name.as_str());

			insert!(
				chain,
				prev_token,
				TokenType::Tag {
					whitespace,
					tag,
					attrs,
				}
			);

			if found.name("autoclose").is_some() {
				insert!(chain, prev_token, TokenType::End);
			}
			continue;
		}

		// HTML entity: stored in its decoded form.
		if let Some(entity) = found.name("html") {
			let decoded =
				html_escape::decode_html_entities(entity.as_str());
			insert!(
				chain,
				prev_token,
				TokenType::Text(interner.get_or_intern(&decoded))
			);
			continue;
		}

		// Plain word or punctuation, stored verbatim.
		if let Some(text) = found.name("word") {
			insert!(
				chain,
				prev_token,
				TokenType::Text(interner.get_or_intern(text.as_str()))
			);
		}
	}

	insert!(chain, prev_token, TokenType::End);
}