use crate::loader::TalkConfig;
use regex::Regex;
use std::collections::HashSet;
/// Category assigned to every [`Token`] produced by the tokenizer.
///
/// `SakuraScript` is emitted for text matched by the tag regex; every other
/// variant is the result of `CharSets::classify` for a single character.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// A complete SakuraScript tag such as `\h` or `\s[0]`.
    SakuraScript,
    /// A character found in the configured period set.
    Period,
    /// A character found in the configured comma set.
    Comma,
    /// A character found in the configured "strong" set.
    Strong,
    /// A character found in the configured leader set.
    Leader,
    /// A character found in the configured line-start-prohibited set.
    LineStartProhibited,
    /// A character found in the configured line-end-prohibited set.
    LineEndProhibited,
    /// Any character that belongs to none of the configured sets.
    General,
}
/// One unit of tokenizer output: a classification plus the exact source text
/// it covers.
#[derive(Debug, Clone)]
pub struct Token {
    /// Classification of this token.
    pub kind: TokenKind,
    /// The text the token was built from (a whole tag or a single character
    /// when produced by the tokenizer in this file).
    pub text: String,
}
impl Token {
pub fn new(kind: TokenKind, text: impl Into<String>) -> Self {
Self {
kind,
text: text.into(),
}
}
}
/// Lookup sets used to classify individual characters.
///
/// Each field holds the characters that map to the `TokenKind` variant of
/// the same name.
#[derive(Debug, Clone)]
pub struct CharSets {
    /// Characters classified as `TokenKind::Period`.
    pub period: HashSet<char>,
    /// Characters classified as `TokenKind::Comma`.
    pub comma: HashSet<char>,
    /// Characters classified as `TokenKind::Strong`.
    pub strong: HashSet<char>,
    /// Characters classified as `TokenKind::Leader`.
    pub leader: HashSet<char>,
    /// Characters classified as `TokenKind::LineStartProhibited`.
    pub line_start_prohibited: HashSet<char>,
    /// Characters classified as `TokenKind::LineEndProhibited`.
    pub line_end_prohibited: HashSet<char>,
}
impl CharSets {
pub fn from_config(config: &TalkConfig) -> Self {
Self {
period: config.chars_period.chars().collect(),
comma: config.chars_comma.chars().collect(),
strong: config.chars_strong.chars().collect(),
leader: config.chars_leader.chars().collect(),
line_start_prohibited: config.chars_line_start_prohibited.chars().collect(),
line_end_prohibited: config.chars_line_end_prohibited.chars().collect(),
}
}
pub fn classify(&self, c: char) -> TokenKind {
if self.period.contains(&c) {
TokenKind::Period
} else if self.comma.contains(&c) {
TokenKind::Comma
} else if self.strong.contains(&c) {
TokenKind::Strong
} else if self.leader.contains(&c) {
TokenKind::Leader
} else if self.line_start_prohibited.contains(&c) {
TokenKind::LineStartProhibited
} else if self.line_end_prohibited.contains(&c) {
TokenKind::LineEndProhibited
} else {
TokenKind::General
}
}
}
/// Splits input text into SakuraScript tag tokens and classified
/// single-character tokens.
///
/// Derives `Debug` and `Clone` like the other public types in this file;
/// both fields support them.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Compiled form of `Tokenizer::SAKURA_TAG_PATTERN`.
    sakura_tag_regex: Regex,
    /// Character sets used to classify everything that is not a tag.
    char_sets: CharSets,
}
impl Tokenizer {
    /// Regex source matching a single SakuraScript tag: a backslash, one or
    /// more characters from `[0-9a-zA-Z_!+*?&-]`, and an optional bracketed
    /// argument `[...]` that contains no `]`.
    pub const SAKURA_TAG_PATTERN: &'static str = r"\\[0-9a-zA-Z_!+*?&-]+(?:\[[^\]]*\])?";

    /// Creates a tokenizer whose character classes come from `config`.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` produced if [`Self::SAKURA_TAG_PATTERN`]
    /// fails to compile.
    pub fn new(config: &TalkConfig) -> Result<Self, regex::Error> {
        Ok(Self {
            sakura_tag_regex: Regex::new(Self::SAKURA_TAG_PATTERN)?,
            char_sets: CharSets::from_config(config),
        })
    }

    /// Returns the compiled SakuraScript tag regex.
    pub fn tag_regex(&self) -> &Regex {
        &self.sakura_tag_regex
    }

    /// Splits `input` into tokens, in source order.
    ///
    /// Each SakuraScript tag becomes one `TokenKind::SakuraScript` token
    /// holding the full tag text. Every other character becomes its own
    /// token, classified by the configured character sets. An empty input
    /// yields an empty vector.
    pub fn tokenize(&self, input: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        // Byte offset of the first character not yet turned into a token.
        let mut text_start = 0;

        // One left-to-right pass over the non-overlapping tag matches. This
        // gives the same tags as probing the regex at every backslash, but
        // without rescanning the rest of the input for each backslash that
        // does not start a tag.
        for tag in self.sakura_tag_regex.find_iter(input) {
            self.push_text_tokens(&input[text_start..tag.start()], &mut tokens);
            tokens.push(Token::new(TokenKind::SakuraScript, tag.as_str()));
            text_start = tag.end();
        }
        // Whatever follows the last tag (or the whole input if there is none).
        self.push_text_tokens(&input[text_start..], &mut tokens);

        tokens
    }

    /// Appends one classified token per character of `text` to `tokens`.
    fn push_text_tokens(&self, text: &str, tokens: &mut Vec<Token>) {
        tokens.extend(
            text.chars()
                .map(|c| Token::new(self.char_sets.classify(c), c)),
        );
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a tokenizer from the default talk configuration.
    fn default_tokenizer() -> Tokenizer {
        Tokenizer::new(&TalkConfig::default()).unwrap()
    }

    /// Tokenizes `input` with a freshly built default tokenizer.
    fn tokenize(input: &str) -> Vec<Token> {
        default_tokenizer().tokenize(input)
    }

    /// Collects the kinds of `tokens` in order.
    fn kinds(tokens: &[Token]) -> Vec<TokenKind> {
        tokens.iter().map(|token| token.kind.clone()).collect()
    }

    /// Asserts that `input` yields exactly `count` tokens, all of kind `kind`.
    #[track_caller]
    fn assert_all_of_kind(input: &str, count: usize, kind: TokenKind) {
        let tokens = tokenize(input);
        assert_eq!(tokens.len(), count);
        for token in &tokens {
            assert_eq!(token.kind, kind);
        }
    }

    /// Asserts that `input` is tokenized as one SakuraScript tag covering it.
    #[track_caller]
    fn assert_single_tag(input: &str) {
        let tokens = tokenize(input);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::SakuraScript);
        assert_eq!(tokens[0].text, input);
    }

    #[test]
    fn test_tokenize_general_text() {
        assert_all_of_kind("こんにちは", 5, TokenKind::General);
    }

    #[test]
    fn test_tokenize_sakura_script_tag() {
        let tokens = tokenize(r"\h\s[0]こんにちは");
        // Two tags followed by five characters.
        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[0].kind, TokenKind::SakuraScript);
        assert_eq!(tokens[0].text, r"\h");
        assert_eq!(tokens[1].kind, TokenKind::SakuraScript);
        assert_eq!(tokens[1].text, r"\s[0]");
    }

    #[test]
    fn test_tokenize_period() {
        assert_all_of_kind("。..", 3, TokenKind::Period);
    }

    #[test]
    fn test_tokenize_comma() {
        assert_all_of_kind("、,,", 3, TokenKind::Comma);
    }

    #[test]
    fn test_tokenize_strong() {
        assert_all_of_kind("!?!?", 4, TokenKind::Strong);
    }

    #[test]
    fn test_tokenize_leader() {
        assert_all_of_kind("・‥…", 3, TokenKind::Leader);
    }

    #[test]
    fn test_tokenize_line_prohibited() {
        let tokenizer = default_tokenizer();

        let closing = tokenizer.tokenize("」』");
        assert_eq!(closing[0].kind, TokenKind::LineStartProhibited);
        assert_eq!(closing[1].kind, TokenKind::LineStartProhibited);

        let opening = tokenizer.tokenize("「『");
        assert_eq!(opening[0].kind, TokenKind::LineEndProhibited);
        assert_eq!(opening[1].kind, TokenKind::LineEndProhibited);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize(r"\hこんにちは。");
        // One tag, five characters, one period.
        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[0].kind, TokenKind::SakuraScript);
        assert_eq!(tokens[6].kind, TokenKind::Period);
    }

    #[test]
    fn test_tokenize_complex_tag() {
        let tokens = tokenize(r"\_w[500]テスト");
        assert_eq!(tokens[0].kind, TokenKind::SakuraScript);
        assert_eq!(tokens[0].text, r"\_w[500]");
    }

    #[test]
    fn test_tokenize_unicode_preservation() {
        let tokens = tokenize("が");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "が");
    }

    #[test]
    fn test_tokenize_empty_string() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn test_tokenize_consecutive_punctuation() {
        let tokens = tokenize("」」」!?。、");
        assert_eq!(
            kinds(&tokens),
            vec![
                TokenKind::LineStartProhibited,
                TokenKind::LineStartProhibited,
                TokenKind::LineStartProhibited,
                TokenKind::Strong,
                TokenKind::Strong,
                TokenKind::Period,
                TokenKind::Comma,
            ]
        );
    }

    #[test]
    fn test_tokenize_symbol_tag_hyphen() {
        assert_single_tag(r"\-");
    }

    #[test]
    fn test_tokenize_symbol_tag_plus() {
        assert_single_tag(r"\+");
    }

    #[test]
    fn test_tokenize_symbol_tag_asterisk() {
        assert_single_tag(r"\*");
    }

    #[test]
    fn test_tokenize_symbol_tag_underscore_question() {
        assert_single_tag(r"\_?");
    }

    #[test]
    fn test_tokenize_symbol_tag_ampersand() {
        assert_single_tag(r"\&[ID]");
    }

    #[test]
    fn test_tokenize_symbol_tag_mixed_text() {
        let tokens = tokenize(r"こんにちは\-。");
        // Five characters, one tag, one period.
        assert_eq!(
            kinds(&tokens),
            vec![
                TokenKind::General,
                TokenKind::General,
                TokenKind::General,
                TokenKind::General,
                TokenKind::General,
                TokenKind::SakuraScript,
                TokenKind::Period,
            ]
        );
        assert_eq!(tokens[5].text, r"\-");
    }
}