use byteorder::{LittleEndian, WriteBytesExt};
use lindera_core::{
error::LinderaErrorKind,
prefix_dict::PrefixDict,
word_entry::{WordEntry, WordId},
LinderaResult,
};
use rayon::prelude::*;
use std::{collections::BTreeMap, str::FromStr};
use yada::{builder::DoubleArrayBuilder, DoubleArray};
use jpreprocess_dictionary::DictionarySerializer;
/// Field count that marks a user-dictionary row as being in the "simple" format.
const SIMPLE_USERDIC_FIELDS_NUM: usize = 3;
/// Minimum field count for a user-dictionary row to be treated as "detailed".
const DETAILED_USERDIC_FIELDS_NUM: usize = 13;
/// Word cost given to every entry created by `WordEntryWithString::simple`.
const SIMPLE_WORD_COST: i16 = -10000;
/// Value used for both `left_id` and `right_id` of entries created by
/// `WordEntryWithString::simple`.
const SIMPLE_CONTEXT_ID: u16 = 0;
/// Word entries grouped by their surface string (the first column of a row).
pub type WordEntryMap = BTreeMap<String, Vec<WordEntry>>;
/// Converts every column of every row into an owned, normalized `String`.
///
/// Each column is stringified with `to_string()`, then every '―' is replaced
/// with "—" and every '~' is replaced with "〜". Rows are visited through
/// `par_iter()`, so the per-row work may run in parallel.
pub fn normalize_rows<'a, T, U, V>(rows: &'a T) -> Vec<Vec<String>>
where
    T: IntoParallelRefIterator<'a, Item = U>,
    U: 'a + IntoIterator<Item = &'a V>,
    V: 'a + ToString + ?Sized,
{
    /// Applies the two character substitutions to one already-stringified column.
    fn normalize_column(text: String) -> String {
        let dash_fixed = text.replace('―', "—");
        dash_fixed.replace('~', "〜")
    }

    rows.par_iter()
        .map(|row| {
            let mut normalized: Vec<String> = Vec::new();
            for column in row {
                normalized.push(normalize_column(column.to_string()));
            }
            normalized
        })
        .collect()
}
pub fn build_word_entry_map(
rows: &Vec<Vec<String>>,
is_system: bool,
) -> LinderaResult<WordEntryMap> {
let entries = rows
.par_iter()
.enumerate()
.map(|(row_id, row)| {
let is_simple = !is_system && row.len() == SIMPLE_USERDIC_FIELDS_NUM;
if is_simple {
Ok(WordEntryWithString::simple(
row_id as u32,
row[0].to_string(),
))
} else {
WordEntryWithString::new(row_id as u32, row, is_system)
}
})
.collect::<Result<Vec<WordEntryWithString>, _>>()?;
let mut word_entry_map: BTreeMap<String, Vec<WordEntry>> = BTreeMap::new();
for entry in entries {
word_entry_map
.entry(entry.surface)
.or_default()
.push(entry.word_entry);
}
Ok(word_entry_map)
}
/// Builds a [`PrefixDict`] from a surface-to-entries map.
///
/// Every surface is inserted into the double array with a packed `u32` value:
/// the low 5 bits hold the number of entries for that surface and the
/// remaining high 27 bits hold the running index of its first entry. Entries
/// are then serialized into `vals_data` in the same map iteration order, so
/// that index addresses the surface's first serialized entry.
///
/// # Errors
///
/// * `Content` if a surface has more entries than fit in the 5-bit count, or
///   if the running entry index no longer fits in 27 bits. Either case would
///   otherwise silently corrupt the packed value.
/// * `Io` if the double array cannot be built.
/// * `Serialize` if a word entry fails to serialize.
pub fn build_prefix_dict(
    word_entry_map: WordEntryMap,
    is_system: bool,
) -> LinderaResult<PrefixDict> {
    // Layout of the packed double-array value.
    const LEN_BITS: u32 = 5;
    const MAX_ENTRIES_PER_SURFACE: usize = (1 << LEN_BITS) - 1;
    const MAX_ID: u32 = u32::MAX >> LEN_BITS;

    let mut id = 0u32;
    let mut keyset: Vec<(&[u8], u32)> = Vec::with_capacity(word_entry_map.len());
    for (key, word_entries) in &word_entry_map {
        // A larger count would spill into the bits reserved for the id.
        if word_entries.len() > MAX_ENTRIES_PER_SURFACE {
            return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
                "surface {:?} has {} entries, but at most {} entries per surface can be encoded",
                key,
                word_entries.len(),
                MAX_ENTRIES_PER_SURFACE
            )));
        }
        // A larger id would lose its high bits in the shift below.
        if id > MAX_ID {
            return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
                "too many word entries: entry index {} exceeds the maximum of {}",
                id,
                MAX_ID
            )));
        }
        // Lossless: the count was checked against MAX_ENTRIES_PER_SURFACE above.
        let len = word_entries.len() as u32;
        let val = (id << LEN_BITS) | len;
        keyset.push((key.as_bytes(), val));
        id += len;
    }

    let da_bytes = DoubleArrayBuilder::build(&keyset).ok_or_else(|| {
        LinderaErrorKind::Io.with_error(anyhow::anyhow!("DoubleArray build error for user dict."))
    })?;

    // Serialize in the same order the ids were assigned above.
    let mut vals_data = Vec::<u8>::new();
    for word_entry in word_entry_map.values().flatten() {
        word_entry
            .serialize(&mut vals_data)
            .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
    }

    Ok(PrefixDict {
        da: DoubleArray::new(da_bytes),
        vals_data,
        is_system,
    })
}
/// A dictionary row reduced to its surface string and its [`WordEntry`].
pub struct WordEntryWithString {
    /// Surface form, taken from the first column of the row.
    surface: String,
    /// Word id, cost and context ids for this surface.
    word_entry: WordEntry,
}

impl WordEntryWithString {
    /// Parses a full dictionary row.
    ///
    /// The first four columns are read as surface, left id, right id and word
    /// cost, in that order; any further columns are ignored here. The numeric
    /// columns are trimmed before parsing.
    ///
    /// # Errors
    ///
    /// * `Content` if the row has fewer than four columns (this used to panic
    ///   on an out-of-bounds index).
    /// * `Parse` if the word cost, left id or right id is not a valid number.
    pub fn new(row_id: u32, row: &[String], is_system: bool) -> LinderaResult<Self> {
        let (surface, left_id, right_id, word_cost) = match row {
            [surface, left_id, right_id, word_cost, ..] => (surface, left_id, right_id, word_cost),
            _ => {
                return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
                    "dictionary row {} has {} fields, but at least 4 are required",
                    row_id,
                    row.len()
                )))
            }
        };

        // Parse in the original order (cost, left, right) so the same error
        // surfaces first when several columns are malformed.
        let word_cost = i16::from_str(word_cost.trim()).map_err(|_err| {
            LinderaErrorKind::Parse.with_error(anyhow::anyhow!("failed to parse word_cost"))
        })?;
        let left_id = u16::from_str(left_id.trim()).map_err(|_err| {
            LinderaErrorKind::Parse.with_error(anyhow::anyhow!("failed to parse cost_id"))
        })?;
        let right_id = u16::from_str(right_id.trim()).map_err(|_err| {
            LinderaErrorKind::Parse.with_error(anyhow::anyhow!("failed to parse cost_id"))
        })?;

        Ok(Self {
            surface: surface.to_string(),
            word_entry: WordEntry {
                word_id: WordId(row_id, is_system),
                word_cost,
                left_id,
                right_id,
            },
        })
    }

    /// Builds an entry for a simple-format user-dictionary row.
    ///
    /// The entry is always marked as non-system and uses the fixed
    /// `SIMPLE_WORD_COST` and `SIMPLE_CONTEXT_ID` values.
    pub fn simple(row_id: u32, string: String) -> Self {
        Self {
            surface: string,
            word_entry: WordEntry {
                word_id: WordId(row_id, false),
                word_cost: SIMPLE_WORD_COST,
                left_id: SIMPLE_CONTEXT_ID,
                right_id: SIMPLE_CONTEXT_ID,
            },
        }
    }
}
/// Serializes the word details of every row and builds the matching offset index.
///
/// Returns `(words_idx_buffer, words_buffer)`:
///
/// * `words_buffer` is the serializer's identifier bytes followed by the
///   serialized details of each row, in row order.
/// * `words_idx_buffer` holds one little-endian `u32` per row: the byte offset
///   of that row's data inside `words_buffer` (the identifier itself gets no
///   index entry).
///
/// System rows, and user rows with at least `DETAILED_USERDIC_FIELDS_NUM`
/// fields, are serialized from their columns after the first four. User rows
/// with exactly `SIMPLE_USERDIC_FIELDS_NUM` fields go through
/// `serialize_simple`.
///
/// # Errors
///
/// * `Content` if a user row has an unsupported field count, or if a system
///   row has fewer than four fields (this used to panic on slicing).
/// * `Serialize` if a row offset does not fit in a `u32` (this used to be
///   silently truncated).
/// * Any error returned by the serializer.
pub fn build_words<S: DictionarySerializer + Send + Sync>(
    serializer: &S,
    rows: &Vec<Vec<String>>,
    is_system: bool,
) -> LinderaResult<(Vec<u8>, Vec<u8>)> {
    let mut words = rows
        .par_iter()
        .map(|row| {
            if is_system || row.len() >= DETAILED_USERDIC_FIELDS_NUM {
                // `get` instead of `&row[4..]`: a short system row must not panic.
                match row.get(4..) {
                    Some(details) => serializer.serialize(details),
                    None => Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
                        "dictionary row has {} fields, but at least 4 are required",
                        row.len()
                    ))),
                }
            } else if row.len() == SIMPLE_USERDIC_FIELDS_NUM {
                serializer.serialize_simple(row)
            } else {
                Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
                    "user dictionary should be a CSV with {} or {}+ fields",
                    SIMPLE_USERDIC_FIELDS_NUM,
                    DETAILED_USERDIC_FIELDS_NUM
                )))
            }
        })
        .collect::<Result<Vec<Vec<u8>>, _>>()?;

    // The identifier is stored at the very start of the words buffer.
    words.insert(0, serializer.identifier().as_bytes().to_vec());

    // Each row's offset is the total length of everything before it,
    // starting after the identifier.
    let mut words_idx_buffer: Vec<u8> = Vec::with_capacity(rows.len() * 4);
    let mut offset = words[0].len();
    for word in words.iter().skip(1) {
        if offset > u32::MAX as usize {
            return Err(LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(
                "word offset {} does not fit in a 32-bit index",
                offset
            )));
        }
        words_idx_buffer
            .write_u32::<LittleEndian>(offset as u32)
            .map_err(|err| LinderaErrorKind::Io.with_error(err))?;
        offset += word.len();
    }

    let words_buffer = words.concat();
    Ok((words_idx_buffer, words_buffer))
}