use crate::fraction::Fraction;
use ::std::{
fs::File,
io::{self, Read},
path::PathBuf,
};
use brotli::Decompressor;
use debug_unsafe::slice::SliceGetter;
use itertools::Itertools;
use langram::{
model::{Model, ModelNgrams},
IntoEnumIterator, NgramSize,
};
use serde_map::SerdeMap;
use thiserror::Error;
/// Deserialized content of one model file.
///
/// The second element maps a [`Fraction`] to a `String` holding every ngram
/// that shares that fraction, packed together (see [`NgramsUnpacker`] for how
/// the string is split back into individual ngrams).
///
/// NOTE(review): the leading `usize` is never read in this file; presumably it
/// is a count stored by the model writer — confirm against the code that
/// produces these files.
pub type FileModel = (usize, SerdeMap<Fraction, String>);
/// [`NgramsUnpacker`] for packed strings whose ngrams are separated by a
/// single space character.
pub(crate) struct SpaceNgramsUnpacker;
/// [`NgramsUnpacker`] for packed strings that store ngrams back to back, with
/// no separator, as fixed-length runs of characters.
pub(crate) struct ChunksNgramsUnpacker;
/// Strategy for splitting one packed ngram string taken from a model file
/// into the individual ngrams it contains.
pub(crate) trait NgramsUnpacker: Sized {
/// Splits `ngrams` into individual ngrams, returned in the order they
/// appear in the input.
///
/// `ngram_size` identifies which ngram size the string belongs to;
/// implementations that do not need it may ignore it.
fn unpack(ngrams: String, ngram_size: NgramSize) -> Vec<String>;
}
impl NgramsUnpacker for ChunksNgramsUnpacker {
    /// Cuts `ngrams` into consecutive groups of `ngram_size as usize + 1`
    /// characters and returns every group as its own `String`.
    ///
    /// Grouping works on `char`s rather than bytes. When the number of
    /// characters is not a multiple of the group length, the last group is
    /// shorter; an empty input produces an empty vector.
    #[inline(always)]
    fn unpack(ngrams: String, ngram_size: NgramSize) -> Vec<String> {
        let chars_per_ngram = ngram_size as usize + 1;
        // `chunks` hands out lazily evaluated groups that borrow from `groups`,
        // so the grouping adaptor has to outlive the loop below.
        let groups = ngrams.chars().chunks(chars_per_ngram);
        let mut unpacked = Vec::new();
        for group in &groups {
            unpacked.push(group.collect::<String>());
        }
        unpacked
    }
}
impl NgramsUnpacker for SpaceNgramsUnpacker {
    /// Splits `ngrams` at every single space character and returns the pieces
    /// as owned strings; the ngram size is not needed and is ignored.
    ///
    /// This follows `str::split` semantics: an empty input yields one empty
    /// string, and two adjacent spaces yield an empty piece between them.
    #[inline(always)]
    fn unpack(ngrams: String, _ngram_size: NgramSize) -> Vec<String> {
        const SEPARATOR: char = ' ';
        ngrams.split(SEPARATOR).map(String::from).collect()
    }
}
/// Errors that can occur while loading a model file into memory.
#[derive(Error, Debug)]
pub enum ModelConversionError {
/// An I/O error occurred; the underlying [`io::Error`] is exposed as the
/// error source. There is no automatic `From` conversion for this
/// variant, so it has to be constructed explicitly.
#[error("Read error")]
Read(#[source] io::Error),
/// `serde_encom` reported an error; `#[from]` lets `?` convert a
/// `serde_encom::Error` into this variant automatically.
#[error("SerdeEncom error")]
SerdeEncom(#[from] serde_encom::Error),
}
/// Decompresses `file` with brotli, reads the whole result as UTF-8 text and
/// deserializes that text into a [`FileModel`] with `serde_encom`.
///
/// # Errors
///
/// * [`ModelConversionError::Read`] when reading the decompressed stream into
///   a string fails.
/// * [`ModelConversionError::SerdeEncom`] when the text cannot be
///   deserialized.
fn read(file: File) -> Result<FileModel, ModelConversionError> {
    let mut text = String::new();
    // 4096 is the buffer size handed to the brotli decompressor.
    Decompressor::new(file, 4096)
        .read_to_string(&mut text)
        .map_err(ModelConversionError::Read)?;
    // `?` converts `serde_encom::Error` through the `#[from]` impl.
    let file_model = serde_encom::from_str(&text)?;
    Ok(file_model)
}
/// Flattens a [`FileModel`] into a `ModelNgrams` collection.
///
/// For every `(fraction, packed ngrams)` entry, the packed string is split
/// with `NU::unpack` and each resulting ngram is paired with the natural
/// logarithm of the fraction's `f64` value. The first element of
/// `file_model` is not used.
fn parse_model<NU: NgramsUnpacker>(file_model: FileModel, ngram_size: NgramSize) -> ModelNgrams {
    let (_, packed_by_fraction) = file_model;
    packed_by_fraction
        .into_iter()
        .flat_map(|(fraction, packed)| {
            // Computed once per entry and shared by all of its ngrams.
            let log_value = fraction.to_f64().ln();
            NU::unpack(packed, ngram_size)
                .into_iter()
                .map(move |ngram| (ngram, log_value))
        })
        .collect()
}
/// Builds a [`Model`] from the per-ngram-size files found in `lang_dir`.
///
/// For every `NgramSize`, the file named by `crate::into_file_name` is looked
/// up inside `lang_dir`. A file that cannot be opened, for any reason, is
/// skipped and leaves that slot at its default value. Word-sized models are
/// unpacked with [`SpaceNgramsUnpacker`], all other sizes with
/// [`ChunksNgramsUnpacker`].
///
/// Returns `Ok(None)` when `lang_dir` is not a directory, or when the unigram
/// slot is still empty after all files were processed.
///
/// # Errors
///
/// Returns the [`ModelConversionError`] from the first file that could be
/// opened but failed to be read or deserialized.
pub fn dir_into_model(lang_dir: PathBuf) -> Result<Option<Model>, ModelConversionError> {
    if !lang_dir.is_dir() {
        return Ok(None);
    }

    let mut model = Model::default();
    for ngram_size in NgramSize::iter() {
        let path = lang_dir.join(crate::into_file_name(ngram_size));
        let file = match File::open(path) {
            Ok(file) => file,
            // An unopenable file simply means "no model for this size".
            Err(_) => continue,
        };

        let file_model = read(file)?;
        let is_word_model = ngram_size == NgramSize::Word;
        let ngrams = if is_word_model {
            parse_model::<SpaceNgramsUnpacker>(file_model, ngram_size)
        } else {
            parse_model::<ChunksNgramsUnpacker>(file_model, ngram_size)
        };
        *model.get_safe_unchecked_mut(ngram_size as usize) = ngrams;
    }

    // A model without unigrams is reported as absent.
    if model.get_safe_unchecked(NgramSize::Uni as usize).is_empty() {
        Ok(None)
    } else {
        Ok(Some(model))
    }
}