#![allow(clippy::suspicious_else_formatting)]
use std::io::Cursor;
use std::io::Read;
use std::io::Seek;
use std::str;
extern crate pathfinding;
mod blob;
mod file;
mod dart;
mod unkchar;
mod userdict;
use self::file::*;
use self::dart::*;
use self::unkchar::*;
use self::userdict::*;
pub use self::blob::Blob;
/// A token entry as laid out in a mecab-format dictionary file.
///
/// Read sequentially from the binary dictionary by [`FormatToken::read`];
/// converted to a [`LexerToken`] when placed in the lattice.
#[derive(Clone, Debug)]
pub (crate) struct FormatToken {
    left_context : u16,
    right_context : u16,
    pos : u16,
    cost : i64,
    original_id : u32,
    feature_offset : u32,
}
impl FormatToken {
    /// Reads one token record from the dictionary stream.
    ///
    /// Fields are stored in file order: left context, right context, pos,
    /// cost (i16, widened to i64), then the feature offset. The trailing
    /// 4 bytes of the record are skipped.
    #[allow(clippy::cast_lossless)]
    fn read<T : Read + std::io::Seek>(sysdic : &mut T, original_id : u32) -> Result<FormatToken, &'static str>
    {
        let left_context = read_u16(sysdic)?;
        let right_context = read_u16(sysdic)?;
        let pos = read_u16(sysdic)?;
        let cost = read_i16(sysdic)? as i64;
        let feature_offset = read_u32(sysdic)?;
        // Skip the remaining 4 bytes of the on-disk record.
        seek_rel_4(sysdic)?;
        Ok(FormatToken { left_context, right_context, pos, cost, original_id, feature_offset })
    }
}
/// Which source a lattice token came from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Token from the system dictionary.
    Normal,
    /// Token from a loaded user dictionary.
    User,
    /// Token synthesized from the unknown-word dictionary.
    UNK,
    /// Pseudo-token marking the string boundary (used at both the
    /// beginning and, as a sentinel, the end of the lattice).
    BOS,
}
/// A token placed in the parse lattice, carrying both its dictionary data
/// and its position within the input text and lattice.
#[derive(Clone, Debug)]
pub struct LexerToken {
    // Connection context ids used to look up edge costs in the matrix.
    left_context : u16,
    right_context : u16,
    // Copied verbatim from the dictionary entry.
    pos : u16,
    /// Cost assigned by the dictionary to this token alone.
    pub cost : i64,
    /// Cost including the connection cost of the incoming edge; filled in
    /// after the best path is found.
    pub real_cost : i64,
    /// Byte range of this token's surface within the input text.
    pub start : usize,
    pub end : usize,
    // Column range this token spans in the lattice.
    lattice_start : usize,
    lattice_end : usize,
    /// Which dictionary this token originated from.
    pub kind : TokenType,
    pub original_id : u32,
    pub feature_offset : u32,
}
impl LexerToken {
    /// Builds a lattice token from a dictionary entry plus its position in
    /// the text and the lattice.
    fn from(other : &FormatToken, start : usize, end : usize, lattice_start : usize, lattice_end : usize, kind : TokenType) -> LexerToken
    {
        LexerToken {
            left_context : other.left_context,
            right_context : other.right_context,
            pos : other.pos,
            cost : other.cost,
            // The incoming edge cost is folded in later, once the best
            // path through the lattice is known.
            real_cost : 0,
            start,
            end,
            lattice_start,
            lattice_end,
            kind,
            original_id : other.original_id,
            feature_offset : other.feature_offset,
        }
    }
    /// Builds the zero-cost boundary pseudo-token used at the edges of the
    /// lattice.
    fn make_bos(start : usize, end : usize, lattice_start : usize, lattice_end : usize) -> LexerToken
    {
        LexerToken {
            kind : TokenType::BOS,
            left_context : 0,
            right_context : 0,
            pos : 0,
            cost : 0,
            real_cost : 0,
            original_id : 0,
            feature_offset : 0,
            start,
            end,
            lattice_start,
            lattice_end,
        }
    }
}
/// A fully-resolved output token: a slice of the input text plus the
/// feature string borrowed from the dictionary.
#[derive(Clone, Debug)]
pub struct ParserToken<'text, 'dict> {
    /// The token's surface form, borrowed from the input text.
    pub surface : &'text str,
    /// The token's feature string, borrowed from the dictionary.
    pub feature : &'dict str,
    pub original_id : u32,
    pub kind : TokenType,
}
impl<'text, 'dict> ParserToken<'text, 'dict> {
fn build(surface : &'text str, feature : &'dict str, original_id : u32, kind : TokenType) -> Self
{
ParserToken
{ surface,
feature,
original_id,
kind
}
}
}
// Connection-cost matrix plus optional lookup caches. Lookups try the full
// cache first, then the fast (partial) cache, then fall back to reading the
// raw blob directly.
struct EdgeInfo {
// When true, fast_matrix_cache holds the entire decoded matrix.
full_cache_enabled : bool,
// When true, fast_matrix_cache holds a submatrix of selected contexts.
fast_edge_enabled : bool,
// Maps a real left context id to its submatrix index, or !0 if absent.
fast_edge_map_left : Vec<u16>,
// Maps a real right context id to its submatrix index, or !0 if absent.
fast_edge_map_right : Vec<u16>,
// Number of left contexts (columns) in the fast submatrix.
fast_edge_left_edges : usize,
// Decoded i16 costs for whichever cache mode is enabled.
fast_matrix_cache : Vec<i16>,
// Raw matrix.bin contents: a 4-byte header (two u16 edge counts) then
// row-major little-endian i16 costs.
blob : Blob,
}
impl EdgeInfo {
    /// Wraps a raw matrix blob with every cache disabled.
    fn new(blob : Blob) -> EdgeInfo
    {
        EdgeInfo {
            blob,
            full_cache_enabled : false,
            fast_edge_enabled : false,
            fast_edge_map_left : Vec::new(),
            fast_edge_map_right : Vec::new(),
            fast_edge_left_edges : 0,
            fast_matrix_cache : Vec::new(),
        }
    }
}
// A loaded mecab-format dictionary set plus tokenizer settings.
pub struct Dict {
// System dictionary (sys.dic).
sys_dic : DartDict,
// Unknown-word dictionary (unk.dic).
unk_dic : DartDict,
// Unknown-character classification table (char.bin).
unk_data : UnkChar,
// Optional user dictionary layered on top of the system dictionary.
user_dic : Option<UserDict>,
// Strip ASCII spaces before tokens (see set_space_stripping).
use_space_stripping : bool,
// Process unknown tokens even when dictionary tokens exist, for
// characters the unk data flags as "always process".
use_unk_forced_processing : bool,
// Greedily group runs of same-class characters into one unknown token.
use_unk_greedy_grouping : bool,
// Emit unknown-token prefixes up to the character class's group length.
use_unk_prefix_grouping : bool,
// Connection context counts, taken from the matrix.bin header.
left_edges : u16,
right_edges : u16,
// Connection cost matrix and its caches.
matrix : EdgeInfo
}
impl Dict {
    /// Loads a dictionary set from the four mecab-format binary blobs: the
    /// system dictionary, the unknown-word dictionary, the connection cost
    /// matrix, and the unknown-character classification table.
    ///
    /// Returns an error if any blob is malformed or if sys.dic and
    /// matrix.bin disagree on the number of left/right contexts.
    #[allow(clippy::cast_lossless)]
    pub fn load(
        sysdic : Blob,
        unkdic : Blob,
        matrix : Blob,
        unkchar : Blob,
    ) -> Result<Dict, &'static str>
    {
        let sys_dic = load_mecab_dart_file(sysdic)?;
        let unk_dic = load_mecab_dart_file(unkdic)?;
        let unk_data = load_char_bin(&mut Cursor::new(unkchar))?;
        // matrix.bin begins with two u16 edge counts, followed by the
        // row-major i16 cost table.
        let mut matrix_cursor = Cursor::new(matrix.as_ref());
        let left_edges = read_u16(&mut matrix_cursor)?;
        let right_edges = read_u16(&mut matrix_cursor)?;
        if sys_dic.left_contexts != left_edges as u32 || sys_dic.right_contexts != right_edges as u32
        {
            return Err("sys.dic and matrix.bin have inconsistent left/right edge counts");
        }
        Ok(Dict {
            sys_dic,
            unk_dic,
            unk_data,
            user_dic : None,
            use_space_stripping : true,
            use_unk_forced_processing : true,
            use_unk_greedy_grouping : true,
            use_unk_prefix_grouping : true,
            left_edges,
            right_edges,
            matrix : EdgeInfo::new(matrix)
        })
    }
    /// Loads a user dictionary on top of the system dictionary, replacing
    /// any previously loaded one.
    pub fn load_user_dictionary(&mut self, userdic : Blob) -> Result<(), &'static str>
    {
        let mut userdic = Cursor::new(userdic);
        self.user_dic = Some(UserDict::load(&mut userdic)?);
        Ok(())
    }
    /// Returns the feature string attached to a lexed token.
    pub fn read_feature_string(&self, token : &LexerToken) -> &str
    {
        self.read_feature_string_by_source(token.kind, token.feature_offset)
    }
    /// Returns the feature string at `offset` in the dictionary that tokens
    /// of the given `kind` originate from.
    ///
    /// Panics if `kind` is `User` and no user dictionary is loaded.
    pub fn read_feature_string_by_source(&self, kind : TokenType, offset : u32) -> &str
    {
        match kind
        {
            TokenType::UNK => self.unk_dic.feature_get(offset),
            TokenType::Normal | TokenType::BOS => self.sys_dic.feature_get(offset),
            TokenType::User => self.user_dic.as_ref().unwrap().feature_get(offset),
        }
    }
    /// Builds a small in-memory submatrix of connection costs covering only
    /// the given context ids, so that lookups between common contexts avoid
    /// reading the raw matrix blob. Does nothing if the full cache is
    /// already enabled.
    #[allow(clippy::cast_lossless)]
    pub fn prepare_fast_matrix_cache(&mut self, fast_left_edges : Vec<u16>, fast_right_edges : Vec<u16>)
    {
        let matrix = &mut self.matrix;
        if matrix.full_cache_enabled
        {
            return;
        }
        // Map each real context id to its submatrix index; !0 marks ids
        // that the fast cache does not cover.
        let mut left_map = vec!(!0u16; self.left_edges as usize);
        let mut right_map = vec!(!0u16; self.right_edges as usize);
        for (i, left) in fast_left_edges.iter().enumerate()
        {
            left_map[*left as usize] = i as u16;
        }
        for (i, right) in fast_right_edges.iter().enumerate()
        {
            right_map[*right as usize] = i as u16;
        }
        let mut submatrix = vec!(0i16; fast_left_edges.len() * fast_right_edges.len());
        // Reuse a single cursor and row buffer across iterations instead of
        // reallocating them for every row.
        let mut reader = Cursor::new(&matrix.blob);
        let mut row = vec!(0i16; self.left_edges as usize);
        for (y, right) in fast_right_edges.iter().enumerate()
        {
            // Each right context owns one row of left_edges i16 costs,
            // stored after the 4-byte header.
            let location = self.left_edges as u64 * *right as u64;
            reader.seek(std::io::SeekFrom::Start(4 + location*2)).unwrap();
            read_i16_buffer(&mut reader, &mut row).unwrap();
            for (i, left) in fast_left_edges.iter().enumerate()
            {
                submatrix[y * fast_left_edges.len() + i] = row[*left as usize];
            }
        }
        matrix.fast_edge_enabled = true;
        matrix.fast_edge_map_left = left_map;
        matrix.fast_edge_map_right = right_map;
        matrix.fast_edge_left_edges = fast_left_edges.len();
        matrix.fast_matrix_cache = submatrix;
    }
    /// Decodes the entire connection matrix into memory so lookups never
    /// touch the raw blob. Disables and discards any fast (partial) cache.
    pub fn prepare_full_matrix_cache(&mut self)
    {
        let matrix = &mut self.matrix;
        matrix.full_cache_enabled = true;
        matrix.fast_edge_enabled = false;
        matrix.fast_edge_map_left = Vec::new();
        matrix.fast_edge_map_right = Vec::new();
        matrix.fast_edge_left_edges = 0;
        matrix.fast_matrix_cache = Vec::new();
        let size = self.left_edges as usize * self.right_edges as usize;
        let mut new_fast_cache = vec!(0; size);
        let mut reader = Cursor::new(&matrix.blob);
        // Skip the two u16 edge-count fields at the start of matrix.bin.
        reader.seek(std::io::SeekFrom::Start(4)).unwrap();
        read_i16_buffer(&mut reader, &mut new_fast_cache[..]).unwrap();
        matrix.fast_matrix_cache = new_fast_cache;
    }
    /// Looks up the connection cost between a left context and a right
    /// context: full cache first, then fast cache, then the raw blob.
    #[allow(clippy::cast_lossless)]
    fn access_matrix(&self, left : u16, right : u16) -> i16
    {
        let matrix = &self.matrix;
        if matrix.full_cache_enabled
        {
            let loc = self.left_edges as usize * right as usize + left as usize;
            return matrix.fast_matrix_cache[loc];
        }
        if matrix.fast_edge_enabled
        {
            let new_left = matrix.fast_edge_map_left [left as usize];
            let new_right = matrix.fast_edge_map_right[right as usize];
            // !0 means the context id is not covered by the fast cache.
            if new_left != !0u16 && new_right != !0u16
            {
                let loc = matrix.fast_edge_left_edges * new_right as usize + new_left as usize;
                return matrix.fast_matrix_cache[loc];
            }
        }
        // Slow path: read the little-endian i16 straight out of the blob,
        // past the 4-byte header.
        let location = self.left_edges as u32 * right as u32 + left as u32;
        let offset = 4 + location as usize * 2;
        let cost = &matrix.blob[offset..offset + 2];
        i16::from_le_bytes([cost[0], cost[1]])
    }
    /// Cost of transitioning from `left` to `right`: the right token's own
    /// cost plus the connection cost between the adjoining contexts.
    ///
    /// Panics if the tokens are not adjacent in the lattice or carry
    /// out-of-range context ids.
    #[allow(clippy::cast_lossless)]
    fn calculate_cost(&self, left : &LexerToken, right : &LexerToken) -> i64
    {
        if left.lattice_end != right.lattice_start
        {
            panic!("disconnected nodes");
        }
        if left.right_context >= self.left_edges
        {
            panic!("bad right_context");
        }
        if right.left_context >= self.right_edges
        {
            panic!("bad left_context");
        }
        right.cost as i64 + self.access_matrix(left.right_context, right.left_context) as i64
    }
    /// Whether the system dictionary or the user dictionary (if any) might
    /// contain `find`.
    fn may_contain(&self, find : &str) -> bool
    {
        self.sys_dic.may_contain(find) || self.user_dic.as_ref().map(|x| x.may_contain(find)).unwrap_or(false)
    }
    /// Enables/disables stripping of leading ASCII spaces before tokens.
    /// Returns the previous setting.
    pub fn set_space_stripping(&mut self, setting : bool) -> bool
    {
        let prev = self.use_space_stripping;
        self.use_space_stripping = setting;
        prev
    }
    /// Enables/disables unknown-token processing for characters flagged
    /// "always process". Returns the previous setting.
    pub fn set_unk_forced_processing(&mut self, setting : bool) -> bool
    {
        let prev = self.use_unk_forced_processing;
        self.use_unk_forced_processing = setting;
        prev
    }
    /// Enables/disables greedy grouping of unknown characters.
    /// Returns the previous setting.
    pub fn set_unk_greedy_grouping(&mut self, setting : bool) -> bool
    {
        let prev = self.use_unk_greedy_grouping;
        self.use_unk_greedy_grouping = setting;
        prev
    }
    /// Enables/disables prefix grouping of unknown characters.
    /// Returns the previous setting.
    pub fn set_unk_prefix_grouping(&mut self, setting : bool) -> bool
    {
        let prev = self.use_unk_prefix_grouping;
        self.use_unk_prefix_grouping = setting;
        prev
    }
}
// Builds one lattice column: every token (system, user, unknown) whose
// surface starts at byte `start` of `text`. Returns the column and the
// number of leading spaces that were stripped, so the caller can skip them.
fn build_lattice_column(dict: &Dict, text : &str, mut start : usize, lattice_len : usize) -> (Vec<LexerToken>, usize)
{
// Optionally strip ASCII spaces; `offset` records how many bytes were
// skipped so lattice positions still line up with byte positions.
let mut offset = 0;
while dict.use_space_stripping && start < text.len() && text[start..].starts_with(' ')
{
offset += 1;
start += 1;
}
let mut index_iter = text[start..].char_indices();
let mut end = start;
let first_char = match index_iter.next()
{
Some((_, c)) =>
{
end += c.len_utf8();
c
}
// Nothing left to tokenize: emit a boundary pseudo-token so the lattice
// search has a terminal node.
None => return (vec!(LexerToken::make_bos(0, 0, lattice_len, lattice_len+1+offset)), offset)
};
let mut substring : &str = &text[start..end];
let mut lattice_column : Vec<LexerToken> = Vec::with_capacity(20);
// Grow the candidate substring one char at a time for as long as some
// dictionary might contain an entry starting with it (common-prefix scan).
while dict.may_contain(&substring)
{
if let Some(matching_tokens) = dict.sys_dic.dic_get(&substring)
{
lattice_column.reserve(matching_tokens.len());
for token in matching_tokens
{
lattice_column.push(LexerToken::from(token, start, end, lattice_len, lattice_len+end-start+offset, TokenType::Normal));
}
}
if let Some(user_dic) = &dict.user_dic
{
if let Some(matching_tokens) = user_dic.dic_get(&substring)
{
lattice_column.reserve(matching_tokens.len());
for token in matching_tokens
{
lattice_column.push(LexerToken::from(token, start, end, lattice_len, lattice_len+end-start+offset, TokenType::User));
}
}
}
match index_iter.next()
{
Some((_, c)) =>
{
end += c.len_utf8();
substring = &text[start..end];
}
None => break
}
}
// Unknown-token handling: runs when grouping is enabled and either no
// dictionary token matched, or the first character is flagged for forced
// processing.
let start_type = &dict.unk_data.get_type(first_char);
if (dict.use_unk_greedy_grouping || dict.use_unk_prefix_grouping)
&& ((dict.use_unk_forced_processing && dict.unk_data.always_process(first_char))
|| lattice_column.is_empty())
{
let mut unk_end = start;
let do_greedy = dict.use_unk_greedy_grouping && start_type.greedy_group;
let do_prefix = dict.use_unk_prefix_grouping && start_type.prefix_group_len > 0;
let mut prefix_len = if do_prefix { start_type.prefix_group_len } else { 0 } as usize;
// Collect the end offsets of each run of characters sharing the first
// character's class; greedy mode consumes the whole run, prefix mode
// stops after prefix_len characters.
let mut unk_indices = vec!();
for (_, c) in text[start..].char_indices()
{
if dict.unk_data.has_type(c, start_type.number)
{
unk_end += c.len_utf8();
unk_indices.push(unk_end);
if !do_greedy && unk_indices.len() >= prefix_len
{
break;
}
}
else
{
break;
}
}
prefix_len = std::cmp::min(prefix_len, unk_indices.len());
if let Some(matching_tokens) = dict.unk_dic.dic_get(&start_type.name)
{
lattice_column.reserve(matching_tokens.len() * (start_type.prefix_group_len as usize + start_type.greedy_group as usize));
for token in matching_tokens
{
// One token covering the whole same-class run...
if do_greedy
{
lattice_column.push(LexerToken::from(token, start, unk_end, lattice_len, lattice_len+unk_end-start+offset, TokenType::UNK));
}
// ...plus one token per prefix length up to prefix_len.
for end in unk_indices[0..prefix_len].iter()
{
lattice_column.push(LexerToken::from(token, start, *end, lattice_len, lattice_len+end-start+offset, TokenType::UNK));
}
}
}
}
// Last resort: if nothing matched at all, emit a single-character unknown
// token using the character's class entry, falling back to "DEFAULT".
let first_char_len = first_char.len_utf8();
let mut build_unknown_single = |name|
{
if lattice_column.is_empty()
{
if let Some(default_tokens) = dict.unk_dic.dic_get(name)
{
if let Some(first_token) = default_tokens.iter().next()
{
lattice_column.push(LexerToken::from(first_token, start, start+first_char_len, lattice_len, lattice_len+first_char_len+offset, TokenType::UNK));
}
}
}
};
build_unknown_single(&start_type.name);
build_unknown_single("DEFAULT");
if lattice_column.is_empty()
{
panic!("unknown chars dictionary has a broken DEFAULT token");
}
(lattice_column, offset)
}
/// Builds the full parse lattice: one column per byte position of `text`
/// (plus a leading beginning-of-string column). Columns at non-character
/// boundaries, or inside stripped space runs, are left empty.
fn build_lattice(dict : &Dict, text : &str) -> Vec<Vec<LexerToken>>
{
    let char_count = text.char_indices().count();
    let mut lattice : Vec<Vec<LexerToken>> = Vec::with_capacity(char_count+2);
    // Column 0 holds the beginning-of-string pseudo-token.
    lattice.push(vec!(LexerToken::make_bos(0, 0, lattice.len(), lattice.len()+1)));
    let mut skip_until_after = 0;
    for i in 0..=text.len()
    {
        let usable = text.is_char_boundary(i) && i >= skip_until_after;
        if usable
        {
            let (column, skipped) = build_lattice_column(dict, text, i, lattice.len());
            skip_until_after = i+skipped;
            lattice.push(column);
        }
        else
        {
            lattice.push(Vec::new());
        }
    }
    lattice
}
// Tokenizes `text`, returning the lowest-total-cost sequence of LexerTokens
// and that total cost, or None if no path through the lattice exists.
pub fn parse_to_lexertokens(dict : &Dict, text : &str) -> Option<(Vec<LexerToken>, i64)>
{
let lattice = build_lattice(dict, text);
// A* over the lattice; a search state is (column, row) identifying one
// token, starting at the beginning-of-string token at (0, 0).
let result = pathfinding::directed::astar::astar(
&(0usize, 0usize),
|&(column, row)|
{
// Successors: every token that starts in the column where this one
// ends, weighted by that token's cost plus the connection cost.
let left = &lattice[column][row];
lattice[left.lattice_end].iter().enumerate().map(move |(row, right)| ((left.lattice_end, row), dict.calculate_cost(left, right)))
},
|&(column, _)|
{
// Heuristic: each remaining column can contribute at least
// 2 * -33000 (token cost + edge cost are i16-valued, so bounded below
// by about -32768 each) — presumably an admissible underestimate.
// NOTE(review): confirm the -66000 bound against the cost ranges.
let dist = lattice.len() as i64 - column as i64;
-66000*2*dist
},
// Goal: a token whose end column lies one past the final lattice column
// (the terminal boundary token emitted for the empty tail).
|&(column, row)| lattice[column][row].lattice_end == lattice.len()
);
if let Some(result) = result
{
let mut token_events : Vec<LexerToken> = result.0[..].iter().map(|(column, row)| lattice[*column][*row].clone()).collect();
#[allow(clippy::cast_lossless)]
// Fill in real_cost: each token's own cost plus its incoming edge cost.
for i in 1..result.0.len()-1
{
let left = &token_events[i-1];
let right = &token_events[i];
let edge_cost = dict.access_matrix(left.right_context, right.left_context);
token_events[i].real_cost = token_events[i].cost + edge_cost as i64;
}
// Drop the trailing boundary token and the leading BOS token.
token_events.pop();
token_events.remove(0);
Some((token_events, result.1))
}
else
{
None
}
}
/// Tokenizes `text` and resolves each resulting token's surface slice and
/// feature string. Returns `None` when no path through the lattice exists.
pub fn parse<'dict, 'text>(dict : &'dict Dict, text : &'text str) -> Option<(Vec<ParserToken<'text, 'dict>>, i64)>
{
    let (tokens, total_cost) = parse_to_lexertokens(dict, text)?;
    let mut lexeme_events : Vec<ParserToken> = Vec::with_capacity(tokens.len());
    for token in &tokens
    {
        lexeme_events.push(ParserToken::build(
            &text[token.start..token.end],
            dict.read_feature_string(token),
            token.original_id,
            token.kind));
    }
    Some((lexeme_events, total_cost))
}
#[cfg(test)]
mod tests {
use std::fs::File;
use super::*;
// Compile-time checks that Dict is usable across threads.
fn assert_implements_sync<T>() where T: Sync {}
fn assert_implements_send<T>() where T: Send {}
// Joins the surface forms of a token stream with the given separator.
fn tokenstream_to_string(stream : &Vec<ParserToken>, comma : &str) -> String
{
let mut ret = String::new();
let mut first = true;
for token in stream
{
if !first
{
ret += comma;
}
ret += &token.surface;
first = false;
}
ret
}
// Parses `input` and asserts the pipe-joined surface forms equal `truth`.
fn assert_parse(dict : &Dict, input : &str, truth : &str)
{
println!("testing parse...");
let result = parse(dict, input).unwrap();
for token in &result.0
{
println!("{}", token.feature);
}
let split_up_string = tokenstream_to_string(&result.0, "|");
println!("{}", split_up_string);
assert_eq!(split_up_string, truth);
}
fn file_to_string(file : &mut File) -> String
{
let mut text = String::new();
file.read_to_string(&mut text).unwrap();
text
}
// End-to-end test; requires the mecab data files under data/.
#[test]
fn test_various()
{
assert_implements_sync::<Dict>();
assert_implements_send::<Dict>();
let sysdic = Blob::open("data/sys.dic").unwrap();
let unkdic = Blob::open("data/unk.dic").unwrap();
let matrix = Blob::open("data/matrix.bin").unwrap();
let unkdef = Blob::open("data/char.bin").unwrap();
let mut dict = Dict::load(sysdic, unkdic, matrix, unkdef).unwrap();
assert_parse(&dict,
"これを持っていけ",
"これ|を|持っ|て|いけ"
);
assert_parse(&dict,
"メタプログラミング (metaprogramming) とはプログラミング技法の一種で、ロジックを直接コーディングするのではなく、あるパターンをもったロジックを生成する高位ロジックによってプログラミングを行う方法、またその高位ロジックを定義する方法のこと。主に対象言語に埋め込まれたマクロ言語によって行われる。",
"メタ|プログラミング|(|metaprogramming|)|と|は|プログラミング|技法|の|一種|で|、|ロジック|を|直接|コーディング|する|の|で|は|なく|、|ある|パターン|を|もっ|た|ロジック|を|生成|する|高位|ロジック|に|よっ|て|プログラミング|を|行う|方法|、|また|その|高位|ロジック|を|定義|する|方法|の|こと|。|主に|対象|言語|に|埋め込ま|れ|た|マクロ|言語|に|よっ|て|行わ|れる|。"
);
assert_parse(&dict,
"Lorem sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
"Lorem|s|i|t|a|m|e|t|,|consectetur|adipiscing|elit|,|sed|do|eiusmod|tempor|incididunt|u|t|l|a|b|o|r|e|e|t|dolore|magna|aliqua|."
);
assert_parse(&dict,
"だっでおら、こんな、こんなにっ!飛車角のこと、好きなんだ!!!!!!",
"だっ|で|おら|、|こんな|、|こんな|に|っ|!|飛車|角|の|こと|、|好き|な|ん|だ|!|!|!|!|!|!"
);
// Single unknown character and space-stripping edge cases.
assert_parse(&dict, "噛", "噛");
assert_parse(&dict, "噛 ", "噛");
assert_parse(&dict, "噛\n", "噛|\n");
// With stripping disabled the space becomes a token of its own.
dict.set_space_stripping(false);
assert_parse(&dict, "a b", "a| |b");
dict.set_space_stripping(true);
assert_parse(&dict, "」 ", "」");
// Unknown-character grouping toggles.
assert_parse(&dict, "噛噛", "噛噛");
dict.set_unk_prefix_grouping(false);
dict.set_unk_greedy_grouping(false);
assert_parse(&dict, "噛噛", "噛|噛");
dict.set_unk_prefix_grouping(true);
dict.set_unk_greedy_grouping(true);
// Forced unknown processing changes how in-dictionary substrings of
// unknown runs are handled.
assert_parse(&dict, "programmprogram", "programmprogram");
dict.set_unk_forced_processing(false);
assert_parse(&dict, "programmprogram", "program|m|program");
dict.set_unk_forced_processing(true);
// Character outside the Basic Multilingual Plane.
assert_parse(&dict, "𛁁", "𛁁");
// User dictionary overrides the system segmentation.
assert_parse(&dict, "飛行機", "飛行|機");
dict.load_user_dictionary(Blob::open("data/userdict.csv").unwrap()).unwrap();
assert_parse(&dict, "飛行機", "飛行機");
// Optional: exercise the fast matrix cache if edge lists are present.
if let Ok(mut common_left_edge_file) = File::open("data/common_edges_left.txt")
{
if let Ok(mut common_right_edge_file) = File::open("data/common_edges_right.txt")
{
let fast_edges_left_text = file_to_string(&mut common_left_edge_file);
let fast_edges_right_text = file_to_string(&mut common_right_edge_file);
let fast_edges_left = fast_edges_left_text .lines().map(|x| x.parse::<u16>().unwrap()).collect::<Vec<_>>();
let fast_edges_right = fast_edges_right_text.lines().map(|x| x.parse::<u16>().unwrap()).collect::<Vec<_>>();
dict.prepare_fast_matrix_cache(fast_edges_left, fast_edges_right);
assert_parse(&dict,
"メタプログラミング (metaprogramming) とはプログラミング技法の一種で、ロジックを直接コーディングするのではなく、あるパターンをもったロジックを生成する高位ロジックによってプログラミングを行う方法、またその高位ロジックを定義する方法のこと。主に対象言語に埋め込まれたマクロ言語によって行われる。",
"メタ|プログラミング|(|metaprogramming|)|と|は|プログラミング|技法|の|一種|で|、|ロジック|を|直接|コーディング|する|の|で|は|なく|、|ある|パターン|を|もっ|た|ロジック|を|生成|する|高位|ロジック|に|よっ|て|プログラミング|を|行う|方法|、|また|その|高位|ロジック|を|定義|する|方法|の|こと|。|主に|対象|言語|に|埋め込ま|れ|た|マクロ|言語|に|よっ|て|行わ|れる|。"
);
}
}
}
}