use std::collections::{HashMap, HashSet};
use std::str::CharIndices;
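/// Tokenizer for MMD (markdown with embedded LaTeX) content.
///
/// Plain words and CJK ideographs become tokens, while markup is handled through four tables:
/// `skip_commands` pairs delimiters (inline/display math, table/tabular/figure environments)
/// whose entire contents are skipped; `skip_tokens` are standalone markers (header hashes,
/// environment delimiters, `\pagebreak`) that are dropped; `drop_commands` are formatting
/// commands (`\footnote`, `\textbf`, ...) whose name is discarded while their argument is
/// still tokenized; `known_commands` are the command names allowed to keep a braced argument
/// inside a single token (e.g. `\begin{table}`).
///
/// `Clone` is required by tantivy's `Tokenizer` trait.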
#[derive(Clone)]
pub struct MmdTokenizer {
skip_commands: HashMap<&'static str, &'static str>,
skip_tokens: HashSet<&'static str>,
drop_commands: HashSet<&'static str>,
known_commands: HashSet<&'static str>,
}
impl Default for MmdTokenizer {
fn default() -> Self {
static SKIP_COMMANDS: [(&str, &str); 6] = [
("\\begin{table}", "\\end{table}"),
("\\(", "\\)"),
("\\[", "\\]"),
("\\begin{tabular}", "\\end{tabular}"),
("\\begin{figure}", "\\end{figure}"),
("$$", "$$"),
];
static SKIP_TOKENS: [&str; 33] = [
"#",
"##",
"###",
"####",
"#####",
"######",
"\\",
"\\begin{theorem}",
"\\end{theorem}",
"\\begin{lemma}",
"\\end{lemma}",
"\\begin{itemize}",
"\\end{itemize}",
"\\begin{equation}",
"\\end{equation}",
"\\begin{equation*}",
"\\end{equation*}",
"\\begin{align}",
"\\end{align}",
"\\begin{align*}",
"\\end{align*}",
"\\begin{split}",
"\\end{split}",
"\\begin{split*}",
"\\end{split*}",
"\\begin{gather}",
"\\end{gather}",
"\\begin{gather*}",
"\\end{gather*}",
"\\end{table}",
"\\end{tabular}",
"\\end{figure}",
"\\pagebreak",
];
static DROP_COMMANDS: [&str; 17] = [
"\\footnote",
"\\footnotemark",
"\\underline",
"\\uline",
"\\uwave",
"\\dashuline",
"\\dotuline",
"\\sout",
"\\xout",
"\\title",
"\\author",
"\\section",
"\\subsection",
"\\subsubsection",
"\\textit",
"\\textbf",
"\\url",
];
static KNOWN_COMMANDS: [&str; 3] = ["\\pagebreak", "\\begin", "\\end"];
MmdTokenizer {
skip_commands: HashMap::from_iter(SKIP_COMMANDS),
skip_tokens: HashSet::from_iter(SKIP_TOKENS),
drop_commands: HashSet::from_iter(DROP_COMMANDS),
known_commands: HashSet::from_iter(KNOWN_COMMANDS),
}
}
}
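/// Token stream produced by [`MmdTokenizer`].
///
/// Walks the input character by character, optionally ignoring precomputed byte ranges
/// (`skip_list`) and shifting every reported offset by `base_offset`, so a fragment of a
/// larger document can be tokenized with document-level offsets and positions.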
pub struct MmdTokenStream<'a> {
skip_list: Option<Vec<(usize, usize)>>,
skip_iter: usize,
chars: CharIndices<'a>,
token: tantivy::tokenizer::Token,
stacked_char: Option<(char, usize)>,
skip_commands: &'a HashMap<&'static str, &'static str>,
skip_tokens: &'a HashSet<&'static str>,
drop_commands: &'a HashSet<&'static str>,
known_commands: &'a HashSet<&'static str>,
base_offset: usize,
maybe_link: bool,
}
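/// Appends `c` to the token and extends its byte offsets; `offset_from` is initialised to a
/// `usize::MAX` sentinel and is set from the first accepted character.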
#[inline]
pub fn accept_char(token: &mut tantivy::tokenizer::Token, c: char, offset: usize) {
if token.offset_from == usize::MAX {
token.offset_from = offset;
}
token.offset_to = offset + c.len_utf8();
token.text.push(c);
}
impl<'a> MmdTokenStream<'a> {
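/// Creates a token stream over `text` with zero base offset and the default starting position.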
pub fn new(
text: &'a str,
skip_commands: &'a HashMap<&'static str, &'static str>,
skip_tokens: &'a HashSet<&'static str>,
drop_commands: &'a HashSet<&'static str>,
known_commands: &'a HashSet<&'static str>,
) -> MmdTokenStream<'a> {
MmdTokenStream {
skip_list: None,
skip_iter: 0,
chars: text.char_indices(),
token: tantivy::tokenizer::Token::default(),
stacked_char: None,
skip_commands,
skip_tokens,
drop_commands,
known_commands,
base_offset: 0,
maybe_link: false,
}
}
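/// Like [`MmdTokenStream::new`], but token positions continue from `position`, all reported
/// byte offsets are shifted by `offset`, and bytes falling into any `skip_list` range are
/// ignored; useful when tokenizing a fragment of a larger document.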
pub fn new_with_offset_and_position(
text: &'a str,
offset: usize,
position: usize,
skip_list: Option<Vec<(usize, usize)>>,
skip_commands: &'a HashMap<&'static str, &'static str>,
skip_tokens: &'a HashSet<&'static str>,
drop_commands: &'a HashSet<&'static str>,
known_commands: &'a HashSet<&'static str>,
) -> MmdTokenStream<'a> {
let token = tantivy::tokenizer::Token {
position,
..Default::default()
};
MmdTokenStream {
skip_list,
skip_iter: 0,
chars: text.char_indices(),
token,
stacked_char: None,
skip_commands,
skip_tokens,
drop_commands,
known_commands,
base_offset: offset,
maybe_link: false,
}
}
pub fn token(&self) -> &tantivy::tokenizer::Token {
&self.token
}
pub fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
&mut self.token
}
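/// Produces the next raw token, returning `false` once the input is exhausted.
///
/// `update_position` controls whether the position counter is bumped; it stays untouched for
/// tokens emitted while resuming after skipped or dropped markup. The counter starts from
/// tantivy's `usize::MAX` sentinel, so the first increment wraps around to position 0.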
fn advance_token(&mut self, update_position: bool) -> bool {
self.token.text.clear();
if update_position {
self.token.position = self.token.position.wrapping_add(1);
}
self.token.offset_from = usize::MAX;
let mut is_command = false;
let mut spec_counter = 0;
let mut start_skipping_round_bracket = false;
let mut skipped_round_bracket = 0;
let mut start_skipping_figure_bracket = false;
let mut skipped_figure_bracket = 0;
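// A character buffered by the previous call (the one that terminated that token) opens this token.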
if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
if is_cjk(&stacked_char) {
return true;
}
if stacked_char == '\\' {
is_command = true;
}
if stacked_char == '[' {
self.maybe_link = true;
}
}
for (offset, c) in &mut self.chars {
let real_offset = self.base_offset + offset;
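// Ignore characters that fall inside a precomputed skip range (document-level byte offsets).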
if let Some(skip_list) = &self.skip_list {
while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
self.skip_iter += 1;
}
if self.skip_iter < skip_list.len() && skip_list[self.skip_iter].0 <= real_offset && real_offset < skip_list[self.skip_iter].1 {
continue;
}
}
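// After "[...]" closes, a '(' that immediately follows starts a link target: skip it up to the
// matching ')', then be ready to skip a trailing "{...}" attribute block as well.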
if start_skipping_round_bracket || skipped_round_bracket > 0 {
start_skipping_round_bracket = false;
if c == '(' {
skipped_round_bracket += 1;
continue;
} else if c == ')' {
skipped_round_bracket -= 1;
if skipped_round_bracket == 0 {
start_skipping_figure_bracket = true;
}
continue;
} else if skipped_round_bracket > 0 {
continue;
}
}
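// Skip the "{...}" attribute block (e.g. "{width=1}") that may follow a link or image target.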
if start_skipping_figure_bracket || skipped_figure_bracket > 0 {
start_skipping_figure_bracket = false;
if c == '{' {
skipped_figure_bracket += 1;
continue;
} else if c == '}' {
skipped_figure_bracket -= 1;
continue;
} else if skipped_figure_bracket > 0 {
continue;
}
}
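// CJK ideographs are emitted as single-character tokens; finish any in-progress token first.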
if is_cjk(&c) {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
return true;
}
accept_char(&mut self.token, c, real_offset);
return true;
}
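// A backslash starts a LaTeX command; if a token is already in progress, finish it and buffer the backslash.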
if c == '\\' {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
return true;
}
is_command = true;
accept_char(&mut self.token, c, real_offset);
} else if c == '[' && !is_command {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
return true;
}
self.maybe_link = true;
} else if c == ']' && self.maybe_link && !is_command {
self.maybe_link = false;
start_skipping_round_bracket = true;
} else if c == '^' || c == '~' {
// Sub-/superscript markers widen the token span but are not added to its text.
self.token.offset_to += 1;
} else if c == '*' || c == '_' {
// Count a run of emphasis markers; a single one splits words, a longer run is absorbed into the token.
spec_counter += 1;
} else if c.is_alphanumeric() || c == '#' || c == '+' {
if spec_counter == 1 {
// A lone '*' or '_' acts as a word boundary: finish the current token and buffer this character.
self.stacked_char = Some((c, offset));
return true;
} else if spec_counter > 1 {
// A "**"/"__" run is swallowed into the current token's span.
self.token.offset_to += spec_counter;
spec_counter = 0;
};
accept_char(&mut self.token, c, real_offset);
} else if is_command && (c == '(' || c == ')' || c == '[' || c == ']') && self.token.text.len() == 1 {
// Math-mode delimiters "\(", "\)", "\[" and "\]" become standalone tokens.
accept_char(&mut self.token, c, real_offset);
break;
} else if is_command && (c == '{' || c == '}') {
if self.drop_commands.contains(&self.token.text.as_str()) {
// Formatting commands such as \footnote or \textbf: drop the command name and keep tokenizing the argument.
is_command = false;
self.token.text.clear();
self.token.offset_from = usize::MAX;
continue;
} else if c == '{' && !self.known_commands.contains(&self.token.text.as_str()) {
// Unknown command with a braced argument: emit the bare command name; its argument is tokenized separately.
break;
}
// Known commands (\begin, \end, \pagebreak) keep their braces, so delimiters like \begin{table} stay one token.
accept_char(&mut self.token, c, real_offset);
if c == '}' {
break;
}
} else if !self.token.text.is_empty() {
break;
}
}
!self.token.text.is_empty()
}
}
impl tantivy::tokenizer::Tokenizer for MmdTokenizer {
type TokenStream<'a> = MmdTokenStream<'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> MmdTokenStream<'a> {
MmdTokenStream::new(text, &self.skip_commands, &self.skip_tokens, &self.drop_commands, &self.known_commands)
}
}
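/// Returns `true` for CJK Unified Ideographs (base block and Extensions A-D), which are
/// tokenized one character at a time.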
#[inline]
fn is_cjk(c: &char) -> bool {
(0x4e00 <= *c as u32 && *c as u32 <= 0x9FFF)
|| (0x3400 <= *c as u32 && *c as u32 <= 0x4DBF)
|| (0x20000 <= *c as u32 && *c as u32 <= 0x2A6DF)
|| (0x2A700 <= *c as u32 && *c as u32 <= 0x2B73F)
|| (0x2B740 <= *c as u32 && *c as u32 <= 0x2B81F)
}
impl<'a> tantivy::tokenizer::TokenStream for MmdTokenStream<'a> {
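/// Post-processes raw tokens: drops skip tokens, swallows everything between paired skip
/// commands (e.g. "\(" ... "\)"), strips leading backslashes from unrecognised commands so
/// they are indexed as plain words, and discards stray closing delimiters.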
fn advance(&mut self) -> bool {
let mut result = self.advance_token(true);
while result {
if self.skip_tokens.contains(&self.token.text.as_str()) {
result = self.advance_token(false);
} else if let Some(end_command) = self.skip_commands.get(self.token.text.as_str()) {
while result && self.token.text != *end_command {
result = self.advance_token(false);
}
result = self.advance_token(false);
} else {
// Strip leading backslashes so unknown commands like "\word2" are indexed as plain words.
while self.token.text.starts_with('\\') {
self.token.offset_from += 1;
self.token.text = self.token.text[1..].to_string();
}
if self.token.text == "]" || self.token.text == "}" || self.token.text == ")" {
result = self.advance_token(false);
} else {
break;
}
}
}
result
}
fn token(&self) -> &tantivy::tokenizer::Token {
&self.token
}
fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
&mut self.token
}
}
#[cfg(test)]
pub mod tests {
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenizerManager};
use super::MmdTokenizer;
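/// Runs `text` through `tokenizer` and asserts that the emitted tokens equal `response`.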
pub fn assert_tokenization(tokenizer: &mut TextAnalyzer, text: &str, response: &[Token]) {
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer.token_stream(text).process(&mut add_token);
}
assert_eq!(tokens, response);
}
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"tokenizer",
TextAnalyzer::builder(MmdTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.build(),
);
let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
assert_tokenization(&mut tokenizer, "#", &[]);
assert_tokenization(
&mut tokenizer,
"# Header1",
&[Token {
offset_from: 2,
offset_to: 9,
position: 0,
text: "header1".to_string(),
position_length: 1,
}],
);
assert_tokenization(&mut tokenizer, "\\begin{table}\\end{table}", &[]);
assert_tokenization(
&mut tokenizer,
"\\begin{table}\\end{table}a",
&[Token {
offset_from: 24,
offset_to: 25,
position: 0,
text: "a".to_string(),
position_length: 1,
}],
);
assert_tokenization(&mut tokenizer, "\\begin{table}# Header 1\\end{table}", &[]);
assert_tokenization(&mut tokenizer, "\\end{table}", &[]);
assert_tokenization(
&mut tokenizer,
"# Header1\nHello, 1 \\ 2 world! \\begin{table}table content\\end{table}\n\\begin{theorem}\ntheorem content\\end{theorem}",
&[
Token {
offset_from: 2,
offset_to: 9,
position: 0,
text: "header1".to_string(),
position_length: 1,
},
Token {
offset_from: 10,
offset_to: 15,
position: 1,
text: "hello".to_string(),
position_length: 1,
},
Token {
offset_from: 17,
offset_to: 18,
position: 2,
text: "1".to_string(),
position_length: 1,
},
Token {
offset_from: 21,
offset_to: 22,
position: 3,
text: "2".to_string(),
position_length: 1,
},
Token {
offset_from: 23,
offset_to: 28,
position: 4,
text: "world".to_string(),
position_length: 1,
},
Token {
offset_from: 84,
offset_to: 91,
position: 5,
text: "theorem".to_string(),
position_length: 1,
},
Token {
offset_from: 92,
offset_to: 99,
position: 6,
text: "content".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"# Header1 \\footnote{footnote text}# Header2 \\uline{\\uline{double line}}",
&[
Token {
offset_from: 2,
offset_to: 9,
position: 0,
text: "header1".to_string(),
position_length: 1,
},
Token {
offset_from: 20,
offset_to: 28,
position: 1,
text: "footnote".to_string(),
position_length: 1,
},
Token {
offset_from: 29,
offset_to: 33,
position: 2,
text: "text".to_string(),
position_length: 1,
},
Token {
offset_from: 36,
offset_to: 43,
position: 3,
text: "header2".to_string(),
position_length: 1,
},
Token {
offset_from: 58,
offset_to: 64,
position: 4,
text: "double".to_string(),
position_length: 1,
},
Token {
offset_from: 65,
offset_to: 69,
position: 5,
text: "line".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"\\noncommand \\noncommand2 \\",
&[
Token {
offset_from: 1,
offset_to: 11,
position: 0,
text: "noncommand".to_string(),
position_length: 1,
},
Token {
offset_from: 13,
offset_to: 24,
position: 1,
text: "noncommand2".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"\\command{weird}",
&[
Token {
offset_from: 1,
offset_to: 8,
position: 0,
text: "command".to_string(),
position_length: 1,
},
Token {
offset_from: 9,
offset_to: 14,
position: 1,
text: "weird".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"word1 \\(x_1 + x_2\\) \\word2",
&[
Token {
offset_from: 0,
offset_to: 5,
position: 0,
text: "word1".to_string(),
position_length: 1,
},
Token {
offset_from: 21,
offset_to: 26,
position: 1,
text: "word2".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"Love**is**bold",
&[Token {
offset_from: 0,
offset_to: 14,
position: 0,
text: "loveisbold".to_string(),
position_length: 1,
}],
);
assert_tokenization(
&mut tokenizer,
"Love*is*bold",
&[
Token {
offset_from: 0,
offset_to: 4,
position: 0,
text: "love".to_string(),
position_length: 1,
},
Token {
offset_from: 5,
offset_to: 7,
position: 1,
text: "is".to_string(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 12,
position: 2,
text: "bold".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"Love **is*bold",
&[
Token {
offset_from: 0,
offset_to: 4,
position: 0,
text: "love".to_string(),
position_length: 1,
},
Token {
offset_from: 7,
offset_to: 9,
position: 1,
text: "is".to_string(),
position_length: 1,
},
Token {
offset_from: 10,
offset_to: 14,
position: 2,
text: "bold".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"How to do x^2",
&[
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "how".to_string(),
position_length: 1,
},
Token {
offset_from: 4,
offset_to: 6,
position: 1,
text: "to".to_string(),
position_length: 1,
},
Token {
offset_from: 7,
offset_to: 9,
position: 2,
text: "do".to_string(),
position_length: 1,
},
Token {
offset_from: 10,
offset_to: 13,
position: 3,
text: "x2".to_string(),
position_length: 1,
},
],
);
assert_tokenization(&mut tokenizer, "![]()", &[]);
assert_tokenization(
&mut tokenizer,
"{width=1}",
&[
Token {
offset_from: 2,
offset_to: 7,
position: 0,
text: "image".to_string(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 12,
position: 1,
text: "text".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"[ref] (author)",
&[
Token {
offset_from: 1,
offset_to: 4,
position: 0,
text: "ref".to_string(),
position_length: 1,
},
Token {
offset_from: 7,
offset_to: 13,
position: 1,
text: "author".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"[ref]test [ref](l)test",
&[
Token {
offset_from: 1,
offset_to: 9,
position: 0,
text: "reftest".to_string(),
position_length: 1,
},
Token {
offset_from: 11,
offset_to: 22,
position: 1,
text: "reftest".to_string(),
position_length: 1,
},
],
);
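// Mixed punctuation: "\[34\]" is skipped as display math, stray closers ("\]", "\)", "\}") and "###"
// are dropped, and the link target after "[ref]" is skipped.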
assert_tokenization(
&mut tokenizer,
"-abc{} \\[34\\] \\] \\) \\} 1 ### abc \\(",
&[
Token {
offset_from: 2,
offset_to: 5,
position: 0,
text: "ref".to_string(),
position_length: 1,
},
Token {
offset_from: 13,
offset_to: 16,
position: 1,
text: "abc".to_string(),
position_length: 1,
},
Token {
offset_from: 35,
offset_to: 36,
position: 2,
text: "1".to_string(),
position_length: 1,
},
Token {
offset_from: 41,
offset_to: 44,
position: 3,
text: "abc".to_string(),
position_length: 1,
},
],
);
}
#[test]
fn test_zh_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"tokenizer",
TextAnalyzer::builder(MmdTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.build(),
);
let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
assert_tokenization(
&mut tokenizer,
"在查hello, worl土d动!",
&[
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "在".to_string(),
position_length: 1,
},
Token {
offset_from: 3,
offset_to: 6,
position: 1,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 6,
offset_to: 11,
position: 2,
text: "hello".to_string(),
position_length: 1,
},
Token {
offset_from: 13,
offset_to: 17,
position: 3,
text: "worl".to_string(),
position_length: 1,
},
Token {
offset_from: 17,
offset_to: 20,
position: 4,
text: "土".to_string(),
position_length: 1,
},
Token {
offset_from: 20,
offset_to: 21,
position: 5,
text: "d".to_string(),
position_length: 1,
},
Token {
offset_from: 21,
offset_to: 24,
position: 6,
text: "动".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"在查土d动",
&[
Token {
offset_from: 0,
offset_to: 3,
position: 0,
text: "在".to_string(),
position_length: 1,
},
Token {
offset_from: 3,
offset_to: 6,
position: 1,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 6,
offset_to: 9,
position: 2,
text: "土".to_string(),
position_length: 1,
},
Token {
offset_from: 9,
offset_to: 10,
position: 3,
text: "d".to_string(),
position_length: 1,
},
Token {
offset_from: 10,
offset_to: 13,
position: 4,
text: "动".to_string(),
position_length: 1,
},
],
);
assert_tokenization(
&mut tokenizer,
"Veri 在查hello, c查m p查 查lex worl土d动!",
&[
Token {
offset_from: 0,
offset_to: 4,
position: 0,
text: "veri".to_string(),
position_length: 1,
},
Token {
offset_from: 5,
offset_to: 8,
position: 1,
text: "在".to_string(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 11,
position: 2,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 11,
offset_to: 16,
position: 3,
text: "hello".to_string(),
position_length: 1,
},
Token {
offset_from: 18,
offset_to: 19,
position: 4,
text: "c".to_string(),
position_length: 1,
},
Token {
offset_from: 19,
offset_to: 22,
position: 5,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 22,
offset_to: 23,
position: 6,
text: "m".to_string(),
position_length: 1,
},
Token {
offset_from: 24,
offset_to: 25,
position: 7,
text: "p".to_string(),
position_length: 1,
},
Token {
offset_from: 25,
offset_to: 28,
position: 8,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 29,
offset_to: 32,
position: 9,
text: "查".to_string(),
position_length: 1,
},
Token {
offset_from: 32,
offset_to: 35,
position: 10,
text: "lex".to_string(),
position_length: 1,
},
Token {
offset_from: 37,
offset_to: 41,
position: 11,
text: "worl".to_string(),
position_length: 1,
},
Token {
offset_from: 41,
offset_to: 44,
position: 12,
text: "土".to_string(),
position_length: 1,
},
Token {
offset_from: 44,
offset_to: 45,
position: 13,
text: "d".to_string(),
position_length: 1,
},
Token {
offset_from: 45,
offset_to: 48,
position: 14,
text: "动".to_string(),
position_length: 1,
},
],
);
assert_tokenization(&mut tokenizer, "。", &[]);
}
}