use serde::{Deserialize, Serialize};
/// Descriptive header data for a dictionary.
///
/// Derives serde's `Serialize`/`Deserialize`, so it can be written and read
/// through any serde format. The `Default` implementation yields version `1`,
/// language `"en"`, `min_frequency` `1`, zeroed counters and no source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DictionaryMetadata {
/// Version number of the metadata/dictionary; `1` by default.
/// NOTE(review): presumably a format version -- confirm against the reader.
pub version: u32,
/// Language identifier for the dictionary; `"en"` by default.
pub language: String,
/// Number of words; `0` by default.
/// NOTE(review): presumably the count of entries actually stored -- confirm.
pub word_count: usize,
/// Token total; `0` by default.
/// NOTE(review): presumably the total tokens seen in the source corpus -- confirm.
pub total_tokens: u64,
/// Frequency lower bound; `1` by default.
/// NOTE(review): presumably the cutoff below which words were dropped -- confirm.
pub min_frequency: u64,
/// Creation time as an unsigned integer; `0` by default.
/// NOTE(review): unit and epoch are not visible here (Unix seconds?) -- confirm.
pub created_at: u64,
/// Optional description of where the dictionary came from; `None` by default.
pub source: Option<String>,
}
impl Default for DictionaryMetadata {
fn default() -> Self {
Self {
version: 1,
language: "en".to_string(),
word_count: 0,
total_tokens: 0,
min_frequency: 1,
created_at: 0,
source: None,
}
}
}
/// A single dictionary entry: a word, its frequency, and an optional
/// log-probability.
///
/// Derives serde's `Serialize`/`Deserialize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WordEntry {
/// The word text.
pub word: String,
/// Frequency associated with the word.
/// NOTE(review): presumably a raw occurrence count -- confirm with the producer.
pub frequency: u64,
/// Log-probability of the word, when one has been supplied.
/// `WordEntry::new` leaves this as `None`; `WordEntry::with_log_prob` sets it.
/// NOTE(review): the logarithm base is not visible here -- confirm.
pub log_prob: Option<f64>,
}
impl WordEntry {
pub fn new(word: String, frequency: u64) -> Self {
Self {
word,
frequency,
log_prob: None,
}
}
pub fn with_log_prob(word: String, frequency: u64, log_prob: f64) -> Self {
Self {
word,
frequency,
log_prob: Some(log_prob),
}
}
}
/// Counters describing a dictionary extraction run.
///
/// All fields start at zero via the derived `Default`. The `Display`
/// implementation prints them as a short multi-line report.
#[derive(Debug, Clone, Default)]
pub struct DictionaryStats {
/// Word total; reported as "Unique words" and used as the denominator of
/// the keep rate.
pub total_words: usize,
/// Number of words kept; reported as "Words kept" and used as the numerator
/// of the keep rate.
pub words_kept: usize,
/// Number of words filtered out; reported as "Words filtered".
pub words_filtered: usize,
/// Token total; reported as "Total tokens".
pub total_tokens: u64,
/// Number of sentences processed; reported as "Sentences processed".
pub sentences_processed: usize,
}
impl std::fmt::Display for DictionaryStats {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "Dictionary Extraction Statistics:")?;
writeln!(f, " Sentences processed: {}", self.sentences_processed)?;
writeln!(f, " Total tokens: {}", self.total_tokens)?;
writeln!(f, " Unique words: {}", self.total_words)?;
writeln!(f, " Words kept: {}", self.words_kept)?;
writeln!(f, " Words filtered: {}", self.words_filtered)?;
if self.total_words > 0 {
let keep_rate = 100.0 * self.words_kept as f64 / self.total_words as f64;
writeln!(f, " Keep rate: {:.1}%", keep_rate)?;
}
Ok(())
}
}