use std::collections::HashMap;
use std::convert::TryFrom;
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;
use std::sync::Arc;
use memmap2::Mmap;
use minilzo_rs::adler32;
use parking_lot::{RwLock, RwLockReadGuard};
use crate::index::{btree::BTreeIndex, fts::FtsIndex, Index, IndexConfig};
use crate::traits::{
BatchResult, Dict, DictConfig, DictError, DictMetadata, DictStats, EntryIterator,
HighPerformanceDict, Result, SearchResult,
};
use crate::util::buffer::{self, read_string, read_u32_le, read_varint};
use crate::util::compression::{self, CompressionAlgorithm};
use crate::util::encoding::{self, TextEncoding};
use crate::util::file_utils;
// Safety limits used while parsing untrusted .mdx input; oversized fields
// are rejected up front instead of being allocated.

/// Maximum accepted length (bytes) of a single headword/key.
const MDICT_MAX_KEY_LENGTH: usize = 16 * 1024;
/// Maximum accepted length (bytes) of a single entry value.
const MDICT_MAX_VALUE_LENGTH: usize = 4 * 1024 * 1024;
/// Maximum accepted size (bytes) of the UTF-16 header text section.
const MDICT_MAX_HEADER_TEXT: usize = 512 * 1024;
/// Maximum accepted decompressed size (bytes) of a block-info section.
const MDICT_MAX_BLOCK_INFO: usize = 16 * 1024 * 1024;
/// Compression scheme of an MDX data block (the on-disk block header stores
/// a 4-byte tag: 0 = none, 1 = LZO, 2 = zlib).
/// NOTE(review): not referenced anywhere in this chunk — confirm it is used
/// elsewhere in the module before removing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MdictBlockCompression {
    // Payload stored uncompressed.
    None,
    // minilzo-compressed payload.
    Lzo,
    // zlib (DEFLATE) compressed payload.
    Zlib,
}
/// Everything parsed out of an MDX file header plus the record-section
/// layout, captured once when the dictionary is opened.
#[derive(Debug, Clone)]
struct MdictHeader {
    // Normalized text encoding name (e.g. "UTF-16LE", "GB18030").
    encoding: String,
    // Engine version from GeneratedByEngineVersion; >= 2.0 switches the
    // format's integer fields from 4 to 8 bytes.
    version: f64,
    // Raw "Encrypted" attribute; bit 1 (value 2) marks an encrypted
    // headword index blob.
    encrypted: i32,
    // True for right-to-left dictionaries (Left2Right attribute != "Yes").
    rtl: bool,
    // Display title (markup stripped, or derived from the file stem).
    title: String,
    // Description attribute with markup stripped.
    description: String,
    // All raw attributes from the header element.
    attributes: HashMap<String, String>,
    // Width in bytes (4 or 8) of the format's integer fields.
    number_size: u8,
    // Absolute file offset of the compressed headword block info.
    headword_block_info_pos: u64,
    // Compressed size (bytes) of the headword block info.
    headword_block_info_size: u64,
    // Number of headword (key) blocks.
    num_headword_blocks: u64,
    // Total number of entries as declared by the header.
    word_count: u64,
    // Total size (bytes) of all headword blocks.
    headword_block_size: u64,
    // NOTE(review): currently always 0 — read_record_block_infos computes
    // the real position but does not return it to the caller.
    record_block_info_pos: u64,
    // Total decompressed size of all record blocks.
    total_records_size: u64,
    // Per-record-block sizes and cumulative offsets.
    record_blocks: Vec<RecordIndex>,
    // On-disk size of the .mdx file at open time.
    file_size: u64,
}
/// Size/offset bookkeeping for a single record block.
///
/// `start_pos` is the cumulative offset within the *compressed* record
/// stream; the `shadow_*` pair is this block's half-open range within the
/// fully *decompressed* record stream.
#[derive(Debug, Clone)]
struct RecordIndex {
    // Compressed size of this block on disk.
    compressed_size: u64,
    // Size of this block once decompressed.
    decompressed_size: u64,
    // Offset of this block within the concatenated compressed record data.
    start_pos: u64,
    // Decompressed-stream range covered by this block: [start, end).
    shadow_start_pos: u64,
    shadow_end_pos: u64,
}
impl RecordIndex {
    /// True when `off` — an offset into the fully decompressed record
    /// stream — lies inside this block's half-open decompressed range.
    fn contains_decompressed_offset(&self, off: u64) -> bool {
        (self.shadow_start_pos..self.shadow_end_pos).contains(&off)
    }
}
/// A parsed headword together with the location of its record data in the
/// decompressed record stream.
/// NOTE(review): not referenced anywhere in this chunk — confirm it is used
/// elsewhere in the module before removing.
#[derive(Debug, Clone)]
struct MdictKeyEntry {
    // The headword itself.
    key: String,
    // Offset of the record within the decompressed record stream.
    record_offset: u64,
    // Length of the record (bytes).
    record_size: u64,
}
/// Reader for MDX dictionary files.
///
/// Entry lookup is delegated to a `.btree` sidecar index and full-text
/// search to an optional `.fts` sidecar; decoded values are cached in an
/// LRU keyed by headword.
pub struct MDict {
    // Path to the .mdx file; sidecar paths are derived from it.
    file_path: std::path::PathBuf,
    // Read-only memory map, present when config.use_mmap is set.
    mmap: Option<Arc<Mmap>>,
    // Owned handle used for seek+read when not memory-mapped.
    file: Option<File>,
    // Parsed header and record-section layout.
    header: MdictHeader,
    // Key -> offset sidecar index; required for lookups.
    btree_index: Option<BTreeIndex>,
    // Optional full-text sidecar index.
    fts_index: Option<FtsIndex>,
    // LRU cache of decoded entry bytes, guarded for shared access.
    entry_cache: Arc<RwLock<lru_cache::LruCache<String, Vec<u8>>>>,
    // Open-time configuration (mmap, cache size, which indexes to load).
    config: DictConfig,
    // Snapshot of header-derived metadata assembled in `new`.
    metadata: DictMetadata,
}
impl MDict {
/// Opens the MDX dictionary at `path`: parses its header, optionally maps
/// the file into memory, and loads any sidecar indexes requested by
/// `config`.
///
/// # Errors
/// - `FileNotFound` / `IoError` for filesystem problems.
/// - `MmapError` when `config.use_mmap` is set and mapping fails.
/// - `UnsupportedOperation` when `config.load_btree` is set but no .btree
///   sidecar exists (MDict lookups require one).
pub fn new<P: AsRef<Path>>(path: P, config: DictConfig) -> Result<Self> {
    let path = path.as_ref();
    let file_path = path.to_path_buf();
    if !path.exists() {
        return Err(DictError::FileNotFound(path.display().to_string()));
    }
    let file = File::open(path).map_err(|e| DictError::IoError(e.to_string()))?;
    let header = Self::read_header(&file, path)?;
    // SAFETY of the mmap (as with any file mapping): the map is read-only,
    // but external truncation of the file while mapped is undefined
    // behavior — callers must not modify the file while the MDict is open.
    let mmap = if config.use_mmap {
        Some(Arc::new(unsafe {
            memmap2::MmapOptions::new()
                .map(&file)
                .map_err(|e| DictError::MmapError(e.to_string()))?
        }))
    } else {
        None
    };
    let (btree_index, fts_index) = if config.load_btree || config.load_fts {
        Self::load_indexes(&file_path, &config, &header)?
    } else {
        (None, None)
    };
    // A B-tree sidecar is mandatory when lookups were requested: there is
    // no built-in scan over the raw MDX key blocks in this reader.
    if config.load_btree && btree_index.is_none() {
        return Err(DictError::UnsupportedOperation(
            "MDict requires an existing B-TREE sidecar for lookups; none found".to_string(),
        ));
    }
    let entry_cache = Arc::new(RwLock::new(lru_cache::LruCache::new(config.cache_size)));
    // NOTE(review): re-stats the file even though header.file_size was just
    // computed in read_header — harmless, but could be reused.
    let file_size = path.metadata().map(|m| m.len()).unwrap_or(0);
    let name = header
        .attributes
        .get("Title")
        .cloned()
        .unwrap_or_else(|| "MDict".to_string());
    let metadata = DictMetadata {
        name,
        version: format!("{}", header.version),
        entries: header.word_count,
        // Empty descriptions normalize to None.
        description: Some(header.description.clone()).filter(|s| !s.is_empty()),
        author: header.attributes.get("Author").cloned(),
        // NOTE(review): "language" is filled from DictCharset/Encoding,
        // which are character-set attributes rather than languages —
        // confirm this is intended.
        language: header
            .attributes
            .get("DictCharset")
            .cloned()
            .or_else(|| header.attributes.get("Encoding").cloned()),
        file_size,
        created: header.attributes.get("CreationDate").cloned(),
        has_btree: btree_index.is_some(),
        has_fts: fts_index.is_some(),
    };
    Ok(Self {
        file_path,
        mmap,
        file: Some(file),
        header,
        btree_index,
        fts_index,
        entry_cache,
        config,
        metadata,
    })
}
fn read_header(file: &File, path: &Path) -> Result<MdictHeader> {
fn adler32(bytes: &[u8]) -> u32 {
const MOD_ADLER: u32 = 65521;
let mut a: u32 = 1;
let mut b: u32 = 0;
for &byte in bytes {
a = (a + byte as u32) % MOD_ADLER;
b = (b + a) % MOD_ADLER;
}
(b << 16) | a
}
let mut reader = BufReader::new(file);
let header_text_size = buffer::read_u32_be(&mut reader)? as usize;
if header_text_size == 0 || header_text_size > MDICT_MAX_HEADER_TEXT {
return Err(DictError::InvalidFormat(format!(
"Invalid MDX header size: {}",
header_text_size
)));
}
let mut header_text_raw = vec![0u8; header_text_size];
reader.read_exact(&mut header_text_raw)?;
let checksum_le = buffer::read_u32_le(&mut reader)?;
let calc = adler32(&header_text_raw);
if calc != checksum_le {
return Err(DictError::InvalidFormat(
"MDX header checksum mismatch".to_string(),
));
}
let u16_len = header_text_raw.len() / 2;
let mut u16_buf = Vec::with_capacity(u16_len);
for i in 0..u16_len {
let lo = header_text_raw[2 * i] as u16;
let hi = header_text_raw[2 * i + 1] as u16;
u16_buf.push(lo | (hi << 8));
}
let header_text = String::from_utf16_lossy(&u16_buf);
let attributes = parse_mdict_header_attributes(&header_text);
let mut encoding = attributes
.get("Encoding")
.cloned()
.unwrap_or_else(|| "UTF-16LE".to_string());
if encoding.eq_ignore_ascii_case("GBK") || encoding.eq_ignore_ascii_case("GB2312") {
encoding = "GB18030".to_string();
} else if encoding.is_empty() || encoding.eq_ignore_ascii_case("UTF-16") {
encoding = "UTF-16LE".to_string();
}
let version = attributes
.get("GeneratedByEngineVersion")
.and_then(|v| v.parse::<f64>().ok())
.unwrap_or(1.0);
let number_size: u8 = if version < 2.0 { 4 } else { 8 };
let encrypted = attributes
.get("Encrypted")
.and_then(|v| v.parse::<i32>().ok())
.unwrap_or(0);
let rtl = attributes
.get("Left2Right")
.map(|v| v != "Yes")
.unwrap_or(false);
let title_attr = attributes.get("Title").cloned().unwrap_or_default();
let title = if title_attr.is_empty()
|| title_attr.len() < 5
|| title_attr == "Title (No HTML code allowed)"
{
path.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("MDict")
.to_string()
} else {
strip_html_like(&title_attr)
};
let description_attr = attributes.get("Description").cloned().unwrap_or_default();
let description = strip_html_like(&description_attr);
let (num_headword_blocks, word_count, headword_block_info_size, headword_block_size) =
Self::read_headword_block_info_header(&mut reader, number_size, version)?;
let headword_block_info_pos = reader.stream_position()?;
let mut headword_block_info_compressed = vec![0u8; headword_block_info_size as usize];
reader.read_exact(&mut headword_block_info_compressed)?;
if encrypted & 2 != 0 {
Self::decrypt_headword_index(&mut headword_block_info_compressed)?;
}
let headword_block_info = Self::decompress_block(
&headword_block_info_compressed,
version,
MDICT_MAX_BLOCK_INFO,
)?;
let headword_blocks =
Self::decode_headword_block_info(&headword_block_info, number_size, &encoding)?;
let (record_blocks, total_records_size) =
Self::read_record_block_infos(&mut reader, number_size)?;
Ok(MdictHeader {
encoding,
version,
encrypted,
rtl,
title,
description,
attributes,
number_size,
headword_block_info_pos,
headword_block_info_size,
num_headword_blocks,
word_count,
headword_block_size,
record_block_info_pos: 0, total_records_size,
record_blocks,
file_size: path.metadata().map(|m| m.len()).unwrap_or(0),
})
}
/// Parses the headword (key) section header that precedes the key-block
/// info blob.
///
/// Layout (integers big-endian, width = `number_size`): block count, word
/// count, decompressed info size (v2+ only), compressed info size, total
/// key-block size — followed, in v2+, by a 4-byte Adler-32 of those bytes.
///
/// Returns `(num_blocks, word_count, compressed_info_size, block_size)`.
fn read_headword_block_info_header<R: Read + Seek>(
    reader: &mut R,
    number_size: u8,
    version: f64,
) -> Result<(u64, u64, u64, u64)> {
    // One width-dependent big-endian format number.
    fn read_num<R2: Read>(r: &mut R2, number_size: u8) -> Result<u64> {
        if number_size == 8 {
            buffer::read_u64_be(r)
        } else {
            Ok(buffer::read_u32_be(r)? as u64)
        }
    }
    // v2 headers carry 5 numbers (extra "decompressed size"); v1 carry 4.
    let header_size = if version >= 2.0 {
        number_size as u64 * 5
    } else {
        number_size as u64 * 4
    };
    let mut header_buf = vec![0u8; header_size as usize];
    reader.read_exact(&mut header_buf)?;
    let mut cursor = std::io::Cursor::new(header_buf);
    let num_blocks = read_num(&mut cursor, number_size)?;
    let word_count = read_num(&mut cursor, number_size)?;
    // Present only in v2+; unused here but must be consumed to keep the
    // cursor aligned with the remaining fields.
    let _decompressed_size = if version >= 2.0 {
        read_num(&mut cursor, number_size)?
    } else {
        0
    };
    let compressed_size = read_num(&mut cursor, number_size)?;
    let block_size = read_num(&mut cursor, number_size)?;
    if version >= 2.0 {
        // The trailing Adler-32 covers the raw header bytes and is stored
        // big-endian in the MDX format (it was previously read
        // little-endian, which failed the check on spec-conforming files).
        let checksum = buffer::read_u32_be(reader)?;
        let calc = adler32(&cursor.get_ref()[..(number_size as usize * 5)]);
        if calc != checksum {
            return Err(DictError::InvalidFormat(
                "Headword block info header checksum mismatch".to_string(),
            ));
        }
    }
    Ok((num_blocks, word_count, compressed_size, block_size))
}
/// In-place decryption of an "Encrypted & 2" headword block info blob.
///
/// The first 8 bytes (compression tag + checksum) are left untouched; each
/// subsequent byte has its nibbles swapped and is XOR-ed with the previous
/// ciphertext byte, its relative index, and a key byte.
///
/// NOTE(review): the reference MDX scheme derives its 16-byte key as
/// ripemd128(seed ++ 0x95 0x36 0x00 0x00); here the key is built ad hoc
/// from the first 4 buffer bytes plus an `i ^ 0x36` pattern — verify this
/// matches the files this reader actually targets before trusting the path.
fn decrypt_headword_index(buffer: &mut [u8]) -> Result<()> {
    if buffer.len() < 8 {
        return Err(DictError::InvalidFormat(
            "Buffer too small for decryption".to_string(),
        ));
    }
    let mut key = [0u8; 16];
    key[0..4].copy_from_slice(&buffer[0..4]);
    key[4..8].copy_from_slice(&[0x95, 0x36, 0x00, 0x00]);
    for i in 8..16 {
        key[i] = (i as u8) ^ 0x36;
    }
    // `prev` carries the previous *ciphertext* byte, so buffer[i] must be
    // saved before it is overwritten.
    let mut prev = 0x36u8;
    for i in 8..buffer.len() {
        let mut byte = buffer[i];
        // Swap high/low nibbles, then XOR with the keystream.
        byte = (byte >> 4) | (byte << 4);
        byte = byte ^ prev ^ ((i - 8) as u8 & 0xFF) ^ key[(i - 8) % 16];
        prev = buffer[i];
        buffer[i] = byte;
    }
    Ok(())
}
/// Decompresses a single MDX block.
///
/// Block layout: a 4-byte compression tag read big-endian (0x00000000
/// none, 0x01000000 LZO, 0x02000000 zlib), a 4-byte big-endian Adler-32 of
/// the *decompressed* payload, then the payload itself.
///
/// `max_output` caps the decompressed size so corrupt or hostile input
/// cannot trigger an unbounded allocation. `_version` is unused (the block
/// layout handled here is the same for v1 and v2 files) but kept for
/// signature compatibility with callers.
fn decompress_block(compressed: &[u8], _version: f64, max_output: usize) -> Result<Vec<u8>> {
    if compressed.len() < 8 {
        return Err(DictError::InvalidFormat(
            "Compressed block too small".to_string(),
        ));
    }
    // Loose sanity bound: even at a pathological ratio the compressed
    // input should not dwarf the allowed output.
    if compressed.len() > max_output.saturating_mul(8) {
        return Err(DictError::InvalidFormat(
            "Compressed block size exceeds safety limit".to_string(),
        ));
    }
    let compression_type = buffer::read_u32_be(&mut std::io::Cursor::new(&compressed[0..4]))?;
    // The Adler-32 is stored big-endian in the MDX format (it was
    // previously read little-endian, which made every checksum comparison
    // fail on spec-conforming files).
    let checksum = buffer::read_u32_be(&mut std::io::Cursor::new(&compressed[4..8]))?;
    let data = &compressed[8..];
    let decompressed = match compression_type {
        0x00000000 => {
            // Stored (uncompressed) block.
            if data.len() > max_output {
                return Err(DictError::InvalidFormat(
                    "Uncompressed block exceeds safety limit".to_string(),
                ));
            }
            if !Self::check_adler32(data, checksum) {
                return Err(DictError::InvalidFormat(
                    "Adler-32 checksum mismatch for uncompressed data".to_string(),
                ));
            }
            data.to_vec()
        }
        0x01000000 => {
            let mut lzo = minilzo_rs::LZO::init().map_err(|e| {
                DictError::DecompressionError(format!("LZO initialization failed: {:?}", e))
            })?;
            // minilzo needs an output-size hint; over-allocate up to 4x the
            // input, clamped to the safety cap.
            let estimated_size = std::cmp::min(data.len().saturating_mul(4), max_output);
            let decompressed = lzo.decompress_safe(data, estimated_size).map_err(|e| {
                DictError::DecompressionError(format!("LZO decompression failed: {:?}", e))
            })?;
            if decompressed.len() > max_output {
                return Err(DictError::InvalidFormat(
                    "LZO block exceeds safety limit".to_string(),
                ));
            }
            if !Self::check_adler32(&decompressed, checksum) {
                return Err(DictError::InvalidFormat(
                    "Adler-32 checksum mismatch for LZO data".to_string(),
                ));
            }
            decompressed
        }
        0x02000000 => {
            use flate2::read::ZlibDecoder;
            use std::io::Read;
            let mut decoder = ZlibDecoder::new(data);
            let mut decompressed = Vec::new();
            decoder.read_to_end(&mut decompressed).map_err(|e| {
                DictError::DecompressionError(format!("Zlib decompression failed: {}", e))
            })?;
            if decompressed.len() > max_output {
                return Err(DictError::InvalidFormat(
                    "Zlib block exceeds safety limit".to_string(),
                ));
            }
            if !Self::check_adler32(&decompressed, checksum) {
                return Err(DictError::InvalidFormat(
                    "Adler-32 checksum mismatch for zlib data".to_string(),
                ));
            }
            decompressed
        }
        _ => {
            return Err(DictError::InvalidFormat(format!(
                "Unknown compression type: 0x{:08X}",
                compression_type
            )));
        }
    };
    Ok(decompressed)
}
/// Computes the Adler-32 checksum of `data` and compares it to `expected`.
fn check_adler32(data: &[u8], expected: u32) -> bool {
    const MOD_ADLER: u32 = 65521;
    // Fold the two running sums (a, b) over the bytes, then combine as
    // (b << 16) | a per the Adler-32 definition.
    let (high, low) = data.iter().fold((0u32, 1u32), |(b, a), &byte| {
        let a = (a + u32::from(byte)) % MOD_ADLER;
        ((b + a) % MOD_ADLER, a)
    });
    ((high << 16) | low) == expected
}
/// Decodes the decompressed headword block info table into a list of
/// `(compressed_size, decompressed_size)` pairs, one per key block.
///
/// Each table row holds: entry count, first headword, last headword (all
/// three skipped here), then the two sizes.
///
/// NOTE(review): whether the headword-length fields are u16 vs u8 is keyed
/// off the text encoding here, but reference readers key it off the format
/// version (v2+ uses u16 regardless of encoding) — confirm with v1/v2
/// samples.
/// NOTE(review): for UTF-16 text the skip should presumably be
/// (length + terminator) * 2 bytes, but only the terminator is doubled via
/// `term_size`; the headword text is skipped by `first_size` bytes, not
/// `first_size * 2`. Verify against a UTF-16 dictionary.
fn decode_headword_block_info(
    data: &[u8],
    number_size: u8,
    encoding: &str,
) -> Result<Vec<(u64, u64)>> {
    let mut blocks = Vec::new();
    let mut cursor = std::io::Cursor::new(data);
    let is_u16 = encoding == "UTF-16LE";
    // Per-headword terminator width: one NUL u16 for UTF-16, one NUL byte
    // otherwise.
    let term_size = if is_u16 { 2 } else { 1 };
    while cursor.position() < data.len() as u64 {
        // Number of entries in this key block (value unused here).
        if number_size == 8 {
            buffer::read_u64_be(&mut cursor)?;
        } else {
            buffer::read_u32_be(&mut cursor)?;
        }
        // Skip the block's first headword.
        let first_size = if is_u16 {
            buffer::read_u16_be(&mut cursor)? as u64
        } else {
            buffer::read_u8(&mut cursor)? as u64
        };
        cursor.seek(std::io::SeekFrom::Current(
            (first_size + term_size as u64) as i64,
        ))?;
        // Skip the block's last headword.
        let last_size = if is_u16 {
            buffer::read_u16_be(&mut cursor)? as u64
        } else {
            buffer::read_u8(&mut cursor)? as u64
        };
        cursor.seek(std::io::SeekFrom::Current(
            (last_size + term_size as u64) as i64,
        ))?;
        let compressed_size = if number_size == 8 {
            buffer::read_u64_be(&mut cursor)?
        } else {
            buffer::read_u32_be(&mut cursor)? as u64
        };
        let decompressed_size = if number_size == 8 {
            buffer::read_u64_be(&mut cursor)?
        } else {
            buffer::read_u32_be(&mut cursor)? as u64
        };
        blocks.push((compressed_size, decompressed_size));
    }
    Ok(blocks)
}
/// Parses the record section: four header numbers, then a compressed
/// block-info table of (compressed, decompressed) sizes for each record
/// block, from which cumulative offsets are derived.
///
/// Returns the per-block index and the total decompressed size of all
/// record blocks.
fn read_record_block_infos<R: Read + Seek>(
    reader: &mut R,
    number_size: u8,
) -> Result<(Vec<RecordIndex>, u64)> {
    // One width-dependent big-endian format number.
    fn read_num<R2: Read>(r: &mut R2, number_size: u8) -> Result<u64> {
        if number_size == 8 {
            buffer::read_u64_be(r)
        } else {
            Ok(buffer::read_u32_be(r)? as u64)
        }
    }
    let num_blocks = read_num(&mut *reader, number_size)?;
    // Entry count repeated from the key section; the header already has it.
    let _total_records = read_num(&mut *reader, number_size)?;
    let info_size = read_num(&mut *reader, number_size)?;
    let total_decompressed_size = read_num(&mut *reader, number_size)?;
    // Absolute position of the info blob. Currently unused because the
    // caller stores 0 in `record_block_info_pos`; kept (underscored) as
    // layout documentation. TODO: thread this back to the header if
    // absolute positions are ever needed.
    let _record_block_info_pos = reader.stream_position()?;
    let mut info_compressed = vec![0u8; info_size as usize];
    reader.read_exact(&mut info_compressed)?;
    let info_decompressed =
        Self::decompress_block(&info_compressed, 2.0, MDICT_MAX_BLOCK_INFO)?;
    // Cap the pre-allocation by what the info table can actually describe
    // (two numbers per block), so a bogus `num_blocks` from a corrupt file
    // cannot force a huge allocation before the reads start failing.
    let max_described = info_decompressed.len() / (2 * number_size as usize);
    let mut record_blocks = Vec::with_capacity((num_blocks as usize).min(max_described));
    let mut cursor = std::io::Cursor::new(info_decompressed);
    let mut acc_compressed = 0u64;
    let mut acc_decompressed = 0u64;
    for _ in 0..num_blocks {
        let compressed_size = read_num(&mut cursor, number_size)?;
        let decompressed_size = read_num(&mut cursor, number_size)?;
        record_blocks.push(RecordIndex {
            compressed_size,
            decompressed_size,
            start_pos: acc_compressed,
            shadow_start_pos: acc_decompressed,
            shadow_end_pos: acc_decompressed + decompressed_size,
        });
        acc_compressed += compressed_size;
        acc_decompressed += decompressed_size;
    }
    Ok((record_blocks, total_decompressed_size))
}
/// Loads the `.btree` / `.fts` sidecar indexes sitting next to the
/// dictionary file, honoring the flags in `config`.
///
/// A missing sidecar is not an error (its slot stays `None`); a sidecar
/// that exists but fails to load or is empty is.
fn load_indexes(
    path: &Path,
    config: &DictConfig,
    _header: &MdictHeader,
) -> Result<(Option<BTreeIndex>, Option<FtsIndex>)> {
    // Sidecars share the dictionary's stem: <stem>.btree / <stem>.fts.
    let stem = path.with_extension("");
    let btree_file = stem.with_extension("btree");
    let fts_file = stem.with_extension("fts");
    let loaded_btree = if config.load_btree && btree_file.exists() {
        let mut idx = BTreeIndex::new();
        idx.load(&btree_file).map_err(|e| {
            DictError::IndexError(format!(
                "Failed to load MDict B-TREE index {}: {}",
                btree_file.display(),
                e
            ))
        })?;
        if !idx.is_built() {
            return Err(DictError::IndexError(format!(
                "MDict B-TREE index {} is not built or is empty",
                btree_file.display()
            )));
        }
        Some(idx)
    } else {
        None
    };
    let loaded_fts = if config.load_fts && fts_file.exists() {
        let mut idx = FtsIndex::new();
        idx.load(&fts_file).map_err(|e| {
            DictError::IndexError(format!(
                "Failed to load MDict FTS index {}: {}",
                fts_file.display(),
                e
            ))
        })?;
        if !idx.is_built() {
            return Err(DictError::IndexError(format!(
                "MDict FTS index {} is not built or is empty",
                fts_file.display()
            )));
        }
        Some(idx)
    } else {
        None
    };
    Ok((loaded_btree, loaded_fts))
}
/// Validates that `offset..offset+length` is a sane read window inside the
/// dictionary file.
///
/// Returns the exclusive end offset and the length as `usize`.
///
/// # Errors
/// - `Internal` when the entry exceeds `MDICT_MAX_VALUE_LENGTH`.
/// - `InvalidFormat` when the range overflows `u64`, runs past the end of
///   the file, or the length cannot be represented as `usize`.
fn validate_entry_window(&self, offset: u64, length: u64) -> Result<(u64, usize)> {
    // Compare in the u64 domain: the previous `length as usize` cast would
    // silently truncate on 32-bit targets and could let an oversized entry
    // slip past this check.
    if length > MDICT_MAX_VALUE_LENGTH as u64 {
        return Err(DictError::Internal(format!(
            "Entry too large: {} bytes",
            length
        )));
    }
    let end = offset
        .checked_add(length)
        .ok_or_else(|| DictError::InvalidFormat("Entry offset/length overflow".to_string()))?;
    if end > self.header.file_size {
        return Err(DictError::InvalidFormat(format!(
            "Entry range {}..{} exceeds file size {}",
            offset, end, self.header.file_size
        )));
    }
    let len_usize = usize::try_from(length).map_err(|_| {
        DictError::InvalidFormat("Entry length does not fit in memory".to_string())
    })?;
    Ok((end, len_usize))
}
/// Reads `length` raw bytes at absolute file offset `offset` and
/// normalizes them to UTF-8.
///
/// Uses the memory map when present, otherwise seeks the owned file
/// handle. The window is bounds-checked against the recorded file size
/// first.
///
/// NOTE(review): these are raw file offsets — no record-block
/// decompression happens here, so this only yields meaningful data for
/// positions that point at stored (uncompressed) content. Confirm what the
/// sidecar index offsets actually reference.
fn read_entry_at_offset(&self, offset: u64, length: u64) -> Result<Vec<u8>> {
    let (end, len_usize) = self.validate_entry_window(offset, length)?;
    if len_usize == 0 {
        return Ok(Vec::new());
    }
    let data = if let Some(ref mmap) = self.mmap {
        // Re-check against the actual mapping length, which can lag the
        // stat-ed size if the file changed after opening.
        if end > mmap.len() as u64 {
            return Err(DictError::IoError("Read past mapped file".to_string()));
        }
        mmap[offset as usize..end as usize].to_vec()
    } else if let Some(ref file) = self.file {
        // NOTE(review): seeking through &File moves the OS-level cursor
        // shared by all readers of this handle — concurrent callers on the
        // non-mmap path would race. Confirm single-threaded use or guard.
        let mut reader = BufReader::new(file);
        reader.seek(SeekFrom::Start(offset))?;
        let mut buffer = vec![0u8; len_usize];
        reader.read_exact(&mut buffer)?;
        buffer
    } else {
        return Err(DictError::Internal("No file handle available".to_string()));
    };
    let converted_data = convert_entry_data_if_needed(&data)?;
    Ok(converted_data)
}
fn binary_search_lookup(&self, key: &str) -> Result<Option<Vec<u8>>> {
if let Some(ref btree) = self.btree_index {
if let Some((data, _offset)) = btree.binary_search(key)? {
Ok(Some(data))
} else {
Ok(None)
}
} else {
self.sequential_search(key)
}
}
/// Fallback lookup used when `binary_search_lookup` has no index to
/// consult.
///
/// A true sequential scan over the MDX record blocks is not implemented:
/// if a B-tree index happens to be available it is queried directly,
/// otherwise the key is reported as absent.
fn sequential_search(&self, key: &str) -> Result<Option<Vec<u8>>> {
    if self.header.record_blocks.is_empty() {
        return Ok(None);
    }
    // Query the index directly instead of bouncing back through
    // `binary_search_lookup`, which would mutually recurse into this
    // function whenever the index is absent.
    match self.btree_index.as_ref() {
        Some(btree) => Ok(btree.binary_search(key)?.map(|(data, _offset)| data)),
        None => Ok(None),
    }
}
/// Returns a clone of the cached entry bytes for `key`, if present.
///
/// Uses `get_mut` (not a peek) so the hit refreshes the entry's LRU
/// recency.
fn get_cached(&self, key: &str) -> Option<Vec<u8>> {
    let mut cache = self.entry_cache.write();
    // Borrow-based lookup: String keys borrow as &str, so no temporary
    // String needs to be allocated for every probe.
    cache.get_mut(key).map(|value| value.clone())
}
/// Inserts (or replaces) a cached entry value; the LRU cache evicts its
/// least-recently-used entry when at capacity.
fn cache_entry(&self, key: String, value: Vec<u8>) {
    let mut cache = self.entry_cache.write();
    cache.insert(key, value);
}
/// (Re)builds the `.btree` / `.fts` sidecar indexes and saves them next to
/// the dictionary file, replacing the in-memory indexes on success.
///
/// Key enumeration is bootstrapped from the *existing* B-tree index (this
/// reader has no native scan of the MDX key blocks), so an initial sidecar
/// must already be loaded.
///
/// # Errors
/// `UnsupportedOperation` when no B-tree index exists or no entries could
/// be enumerated; `IndexError` when a build produces an empty index.
pub fn build_indexes(&mut self) -> Result<()> {
    // Nothing requested — nothing to do.
    if !self.config.load_btree && !self.config.load_fts {
        return Ok(());
    }
    let base_btree =
        match &self.btree_index {
            Some(idx) => idx,
            None => return Err(DictError::UnsupportedOperation(
                "MDict index building requires an existing B-TREE index for key enumeration"
                    .to_string(),
            )),
        };
    // `entries` feeds the FTS build (key + entry bytes); `btree_sidecar`
    // stores each key's original numeric offset as little-endian bytes.
    let mut entries: Vec<(String, Vec<u8>)> = Vec::new();
    let mut btree_sidecar: Vec<(String, Vec<u8>)> = Vec::new();
    // U+10FFFF is the highest scalar value, so this range spans all keys.
    let all = base_btree.range_query("", "\u{10FFFF}")?;
    for (key, offset) in all {
        // Entries whose values fail to load are skipped (best effort).
        if let Ok(value) = self.get(&key) {
            entries.push((key.clone(), value));
            btree_sidecar.push((key, offset.to_le_bytes().to_vec()));
        }
    }
    if entries.is_empty() {
        return Err(DictError::UnsupportedOperation(
            "MDict index building: no entries could be enumerated from existing index"
                .to_string(),
        ));
    }
    if self.config.load_btree {
        let mut btree = BTreeIndex::new();
        let index_config = IndexConfig::default();
        btree.build(&btree_sidecar, &index_config)?;
        if !btree.is_built() {
            return Err(DictError::IndexError(
                "MDict B-TREE index build produced an empty index".to_string(),
            ));
        }
        let btree_path = self.file_path.with_extension("btree");
        btree.save(&btree_path)?;
        self.btree_index = Some(btree);
    }
    if self.config.load_fts {
        let mut fts = FtsIndex::new();
        let index_config = IndexConfig::default();
        fts.build(&entries, &index_config)?;
        if !fts.is_built() {
            return Err(DictError::IndexError(
                "MDict FTS index build produced an empty index".to_string(),
            ));
        }
        let fts_path = self.file_path.with_extension("fts");
        fts.save(&fts_path)?;
        self.fts_index = Some(fts);
    }
    Ok(())
}
/// Gathers every (key, value) pair reachable through the B-tree index, in
/// key order.
///
/// Returns an empty vector when no index is loaded; entries whose values
/// fail to load are silently skipped (best effort).
fn collect_all_entries(&self) -> Result<Vec<(String, Vec<u8>)>> {
    match &self.btree_index {
        None => Ok(Vec::new()),
        Some(btree) => {
            let pairs = btree
                .range_query("", "\u{10FFFF}")?
                .into_iter()
                .filter_map(|(key, _off)| self.get(&key).ok().map(|val| (key, val)))
                .collect();
            Ok(pairs)
        }
    }
}
/// Lists the files backing this dictionary: the .mdx itself plus the
/// sidecar paths for whichever indexes are currently loaded.
pub fn file_paths(&self) -> Vec<std::path::PathBuf> {
    let mut paths = Vec::with_capacity(3);
    paths.push(self.file_path.clone());
    if self.btree_index.is_some() {
        paths.push(self.file_path.with_extension("btree"));
    }
    if self.fts_index.is_some() {
        paths.push(self.file_path.with_extension("fts"));
    }
    paths
}
}
impl Dict<String> for MDict {
/// Returns the metadata snapshot assembled in `MDict::new`.
fn metadata(&self) -> &DictMetadata {
    &self.metadata
}
fn contains(&self, key: &String) -> Result<bool> {
match self.get(key) {
Ok(_) => Ok(true),
Err(DictError::IndexError(_)) => Ok(false),
Err(_) => Ok(false),
}
}
/// Fetches the entry bytes for `key`, consulting the LRU cache first and
/// populating it after a successful index lookup.
///
/// # Errors
/// `IndexError("Key not found")` when the key is absent; other errors
/// propagate from the underlying lookup.
fn get(&self, key: &String) -> Result<Vec<u8>> {
    if let Some(hit) = self.get_cached(key) {
        return Ok(hit);
    }
    match self.binary_search_lookup(key)? {
        Some(data) => {
            self.cache_entry(key.clone(), data.clone());
            Ok(data)
        }
        None => Err(DictError::IndexError("Key not found".to_string())),
    }
}
/// Returns up to `limit` entries (default 100) whose headword starts with
/// `prefix`, in key order.
///
/// Requires the B-tree index; yields an empty result set when it is not
/// loaded. Entries whose values fail to load are skipped.
fn search_prefix(&self, prefix: &str, limit: Option<usize>) -> Result<Vec<SearchResult>> {
    let limit = limit.unwrap_or(100);
    let mut results = Vec::new();
    if let Some(ref btree) = self.btree_index {
        // U+10FFFF is the highest scalar value, so this range covers every
        // key beginning with the prefix.
        let range_results = btree.range_query(prefix, &(prefix.to_string() + "\u{10FFFF}"))?;
        for (key, _offset) in range_results.iter().take(limit) {
            if !key.starts_with(prefix) {
                continue;
            }
            // Pass the &String through directly — the old code allocated a
            // fresh String per lookup via key.to_string().
            if let Ok(entry) = self.get(key) {
                results.push(SearchResult {
                    word: key.clone(),
                    entry,
                    score: None,
                    highlights: None,
                });
            }
        }
    }
    Ok(results)
}
/// Fuzzy matching is not implemented for MDict: this falls back to a plain
/// prefix search and ignores `_max_distance`.
fn search_fuzzy(&self, query: &str, _max_distance: Option<u32>) -> Result<Vec<SearchResult>> {
    self.search_prefix(query, None)
}
/// Full-text search through the FTS sidecar.
///
/// Results are materialized eagerly and returned as an iterator; per-key
/// value-load failures are yielded as `Err` items rather than aborting the
/// whole result set.
///
/// # Errors
/// `UnsupportedOperation` when no FTS index is loaded.
fn search_fulltext(
    &self,
    query: &str,
) -> Result<Box<dyn Iterator<Item = Result<SearchResult>> + Send>> {
    if let Some(ref fts) = self.fts_index {
        let search_results = fts.search(query)?;
        let mut items: Vec<Result<SearchResult>> = Vec::with_capacity(search_results.len());
        for (key, score) in search_results {
            match self.get(&key) {
                Ok(entry) => {
                    items.push(Ok(SearchResult {
                        word: key,
                        entry,
                        score: Some(score),
                        highlights: None,
                    }));
                }
                Err(e) => {
                    items.push(Err(e));
                }
            }
        }
        Ok(Box::new(items.into_iter()))
    } else {
        Err(DictError::UnsupportedOperation(
            "FTS index not available".to_string(),
        ))
    }
}
/// Returns the entries at positions `range` in headword (index) order.
///
/// Out-of-range positions are clamped rather than panicking; an empty
/// range yields an empty vector.
///
/// # Errors
/// `UnsupportedOperation` when no B-tree index is loaded.
fn get_range(&self, range: std::ops::Range<usize>) -> Result<Vec<(String, Vec<u8>)>> {
    if range.is_empty() {
        return Ok(Vec::new());
    }
    match &self.btree_index {
        Some(btree) => {
            let all = btree.range_query("", "\u{10FFFF}")?;
            // Clamp the window to the available key count.
            let end = range.end.min(all.len());
            let start = range.start.min(end);
            let mut out = Vec::with_capacity(end - start);
            for (key, _off) in &all[start..end] {
                // Entries that fail to load are skipped (best effort).
                if let Ok(val) = self.get(key) {
                    out.push((key.clone(), val));
                }
            }
            Ok(out)
        }
        None => Err(DictError::UnsupportedOperation(
            "MDict get_range requires a loaded B-TREE index".to_string(),
        )),
    }
}
/// Iterator over all entries in index order; the full key list is
/// materialized up front via `keys()`, values are fetched lazily by the
/// iterator.
fn iter(&self) -> Result<EntryIterator<String>> {
    let keys = self.keys()?;
    Ok(EntryIterator {
        keys: keys.into_iter(),
        dictionary: self,
    })
}
/// Boxed iterator over (word, entry) pairs sharing `prefix`; results are
/// collected eagerly from `search_prefix` with the default limit lifted
/// only by that method's own cap.
fn prefix_iter(
    &self,
    prefix: &str,
) -> Result<Box<dyn Iterator<Item = Result<(String, Vec<u8>)>> + Send>> {
    let pairs: Vec<Result<(String, Vec<u8>)>> = self
        .search_prefix(prefix, None)?
        .into_iter()
        .map(|hit| Ok((hit.word, hit.entry)))
        .collect();
    Ok(Box::new(pairs.into_iter()))
}
/// Entry count as declared by the MDX header word count (not recounted
/// from the index).
fn len(&self) -> usize {
    self.header.word_count as usize
}
/// Trait-level view of the backing files.
/// NOTE(review): unlike the inherent `MDict::file_paths`, this reports only
/// the .mdx file and omits loaded .btree/.fts sidecars — confirm whether
/// trait callers expect the sidecars here too.
fn file_paths(&self) -> &[std::path::PathBuf] {
    std::slice::from_ref(&self.file_path)
}
/// Re-reads the sidecar indexes from disk, replacing whatever is loaded —
/// including dropping indexes whose sidecar files have disappeared.
fn reload_indexes(&mut self) -> Result<()> {
    // Clone the config first so the &self borrow ends before &mut fields.
    let config = self.config.clone();
    let (btree, fts) = Self::load_indexes(&self.file_path, &config, &self.header)?;
    self.btree_index = btree;
    self.fts_index = fts;
    Ok(())
}
/// Empties the LRU entry cache.
fn clear_cache(&mut self) {
    self.entry_cache.write().clear();
}
/// Coarse statistics snapshot.
/// NOTE(review): `cache_hit_rate` is not tracked (always 0.0) and
/// `memory_usage` is approximated by the on-disk file size; `index_sizes`
/// is left empty.
fn stats(&self) -> DictStats {
    DictStats {
        total_entries: self.len() as u64,
        cache_hit_rate: 0.0,
        memory_usage: self.header.file_size,
        index_sizes: HashMap::new(),
    }
}
/// Delegates to the inherent `MDict::build_indexes`.
fn build_indexes(&mut self) -> Result<()> {
    MDict::build_indexes(self)
}
}
/// Normalizes raw entry bytes to UTF-8.
///
/// Bytes that are already valid UTF-8 pass through unchanged; otherwise
/// the encoding is auto-detected from the bytes and the data transcoded.
/// NOTE(review): detection inspects the bytes only — the header's declared
/// encoding is not consulted, and legacy single-byte encodings can be
/// misidentified. Confirm this is intentional.
pub fn convert_entry_data_if_needed(data: &[u8]) -> Result<Vec<u8>> {
    if std::str::from_utf8(data).is_ok() {
        return Ok(data.to_vec());
    }
    let detected_encoding = encoding::detect_encoding(data)?;
    let utf8_string = encoding::convert_to_utf8(data, detected_encoding)?;
    Ok(utf8_string.into_bytes())
}
/// Parses the attribute list of the single XML-like element in an MDX
/// header (e.g. `<Dictionary Title="My Dict" Encoding="UTF-8"/>`) into a
/// key → value map.
///
/// Quoted values may contain whitespace — the previous implementation
/// split on whitespace, which truncated common attributes such as
/// `Title="A Great Dictionary"` and left a stray quote on the last
/// attribute of self-closing tags. Unquoted values end at the next
/// whitespace. Malformed input yields an empty (or partial) map rather
/// than an error.
fn parse_mdict_header_attributes(header: &str) -> HashMap<String, String> {
    let mut attrs = HashMap::new();
    let start = match header.find('<') {
        Some(s) => s,
        None => return attrs,
    };
    let end = match header[start..].find('>') {
        Some(e) => start + e,
        None => return attrs,
    };
    let elem = &header[start + 1..end];
    // Drop the element name; everything after the first whitespace run is
    // the attribute list.
    let mut rest = match elem.find(char::is_whitespace) {
        Some(i) => elem[i..].trim_start(),
        None => return attrs,
    };
    while !rest.is_empty() {
        let eq = match rest.find('=') {
            Some(i) => i,
            None => break,
        };
        let key = rest[..eq].trim();
        rest = rest[eq + 1..].trim_start();
        let value: &str;
        let first = rest.chars().next();
        if first == Some('"') || first == Some('\'') {
            // Quoted value: runs to the matching quote and may contain
            // whitespace; an unterminated quote consumes the remainder.
            let quote = first.unwrap();
            let body = &rest[1..];
            match body.find(quote) {
                Some(close) => {
                    value = &body[..close];
                    rest = body[close + 1..].trim_start();
                }
                None => {
                    value = body;
                    rest = "";
                }
            }
        } else {
            // Bare value: runs to the next whitespace.
            let stop = rest.find(char::is_whitespace).unwrap_or(rest.len());
            value = &rest[..stop];
            rest = rest[stop..].trim_start();
        }
        if !key.is_empty() {
            attrs.insert(key.to_string(), value.to_string());
        }
    }
    attrs
}
/// Drops everything between '<' and '>' (inclusive) and trims surrounding
/// whitespace. A lightweight tag stripper, not an HTML parser: stray '>'
/// characters are dropped and entities are left untouched.
fn strip_html_like(input: &str) -> String {
    let mut inside_tag = false;
    let text: String = input
        .chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect();
    text.trim().to_string()
}
impl HighPerformanceDict<String> for MDict {
    /// Same as `Dict::get` — the base lookup already goes through the
    /// B-tree binary search, so there is no faster specialized path.
    fn binary_search_get(&self, key: &String) -> Result<Vec<u8>> {
        self.get(key)
    }
    /// Streaming search is not supported for MDict.
    fn stream_search(
        &self,
        _query: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<SearchResult>>>> {
        Err(DictError::UnsupportedOperation(
            "Stream search not implemented".to_string(),
        ))
    }
}