1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
use std::{str::FromStr, u32};

use log::warn;
use serde::{Deserialize, Serialize};

use crate::{
    character_definition::CategoryId,
    error::LinderaErrorKind,
    word_entry::{WordEntry, WordId},
    LinderaResult,
};

/// Dictionary used to look up cost information for unknown words.
///
/// The structure is (de)serializable with serde; `load` reads it back from
/// bincode-encoded bytes.
#[derive(Serialize, Deserialize, Clone)]
pub struct UnknownDictionary {
    /// For each category (indexed by `CategoryId.0`), the ids of the entries
    /// in `costs` that belong to that category.
    pub category_references: Vec<Vec<u32>>,
    /// Word entries (context ids and cost), indexed by word id.
    pub costs: Vec<WordEntry>,
}

impl UnknownDictionary {
    /// Rebuilds an `UnknownDictionary` from its bincode-encoded bytes.
    ///
    /// # Errors
    ///
    /// Returns a `LinderaErrorKind::Deserialize` error wrapping the bincode
    /// failure when `unknown_data` cannot be decoded.
    pub fn load(unknown_data: &[u8]) -> LinderaResult<UnknownDictionary> {
        match bincode::deserialize(unknown_data) {
            Ok(dictionary) => Ok(dictionary),
            Err(cause) => Err(LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(cause))),
        }
    }

    /// Returns a copy of the word entry stored at position `word_id` in `costs`.
    ///
    /// # Panics
    ///
    /// Panics if `word_id` is not a valid index into `costs`.
    pub fn word_entry(&self, word_id: u32) -> WordEntry {
        let index = word_id as usize;
        self.costs[index]
    }

    /// Returns the word ids registered for the given category.
    ///
    /// # Panics
    ///
    /// Panics if `category_id.0` is not a valid index into
    /// `category_references`.
    pub fn lookup_word_ids(&self, category_id: CategoryId) -> &[u32] {
        let index = category_id.0;
        self.category_references[index].as_slice()
    }
}

/// One parsed row of the unknown-word definition (presumably a line of
/// `unk.def` — confirm against the caller that supplies the file content).
#[derive(Debug)]
pub struct UnknownDictionaryEntry {
    /// First field of the row; compared against category names when the
    /// category references are built.
    pub surface: String,
    /// Left context id (second field of the row).
    pub left_id: u32,
    /// Right context id (third field of the row).
    pub right_id: u32,
    /// Word cost (fourth field of the row).
    pub word_cost: i32,
}

/// Parses the comma-separated fields of one row into an
/// [`UnknownDictionaryEntry`].
///
/// Only the first four fields are read: `fields[0]` is the surface,
/// `fields[1]` the left id, `fields[2]` the right id and `fields[3]` the word
/// cost. Any additional fields are ignored.
///
/// # Errors
///
/// * `LinderaErrorKind::Content` if `fields.len()` differs from
///   `expected_fields_len`, or if the row has fewer than the four fields this
///   parser reads.
/// * `LinderaErrorKind::Parse` if an id or the cost is not a valid integer.
fn parse_dictionary_entry(
    fields: &[&str],
    expected_fields_len: usize,
) -> LinderaResult<UnknownDictionaryEntry> {
    // Number of leading fields that are indexed below.
    const REQUIRED_FIELDS_LEN: usize = 4;

    if fields.len() != expected_fields_len {
        return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
            "Invalid number of fields. Expect {}, got {}",
            expected_fields_len,
            fields.len()
        )));
    }
    // The check above only compares against the caller-supplied length; a
    // caller passing a value below four would otherwise make the indexing
    // below panic instead of reporting an error.
    if fields.len() < REQUIRED_FIELDS_LEN {
        return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
            "Invalid number of fields. Expect at least {}, got {}",
            REQUIRED_FIELDS_LEN,
            fields.len()
        )));
    }

    let surface = fields[0];
    let left_id = u32::from_str(fields[1])
        .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))?;
    let right_id = u32::from_str(fields[2])
        .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))?;
    let word_cost = i32::from_str(fields[3])
        .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))?;

    Ok(UnknownDictionaryEntry {
        surface: surface.to_string(),
        left_id,
        right_id,
        word_cost,
    })
}

/// Collects the positions (as `u32`) of every entry whose surface is equal to
/// `target_surface`, in the order the entries appear.
///
/// Returns an empty vector when no entry matches.
fn get_entry_id_matching_surface(
    entries: &[UnknownDictionaryEntry],
    target_surface: &str,
) -> Vec<u32> {
    let mut matching_ids = Vec::new();
    for (position, candidate) in entries.iter().enumerate() {
        if candidate.surface.as_str() == target_surface {
            matching_ids.push(position as u32);
        }
    }
    matching_ids
}

/// Builds, for every category name in `categories`, the list of entry ids
/// whose surface equals that name.
///
/// The outer vector has one element per category, in the same order as
/// `categories`.
fn make_category_references(
    categories: &[String],
    entries: &[UnknownDictionaryEntry],
) -> Vec<Vec<u32>> {
    let mut references = Vec::with_capacity(categories.len());
    for category_name in categories {
        references.push(get_entry_id_matching_surface(entries, category_name));
    }
    references
}

/// Converts every parsed entry into a `WordEntry`, preserving order.
///
/// Each resulting entry carries `WordId(std::u32::MAX, true)` as its word id.
/// The ids and the cost are narrowed with `as` casts, so values outside the
/// target range are truncated without an error.
fn make_costs_array(entries: &[UnknownDictionaryEntry]) -> Vec<WordEntry> {
    let mut costs = Vec::with_capacity(entries.len());
    for entry in entries {
        // Mismatching left/right context ids in unk.def are tolerated on
        // purpose: they are only reported through a warning.
        if entry.left_id != entry.right_id {
            warn!("left id and right id are not same: {:?}", entry);
        }
        let word_entry = WordEntry {
            word_id: WordId(std::u32::MAX, true),
            left_id: entry.left_id as u16,
            right_id: entry.right_id as u16,
            word_cost: entry.word_cost as i16,
        };
        costs.push(word_entry);
    }
    costs
}

pub fn parse_unk(
    categories: &[String],
    file_content: &str,
    expected_fields_len: usize,
) -> LinderaResult<UnknownDictionary> {
    let mut unknown_dict_entries = Vec::new();
    for line in file_content.lines() {
        let fields: Vec<&str> = line.split(',').collect::<Vec<&str>>();
        let entry = parse_dictionary_entry(&fields[..], expected_fields_len)?;
        unknown_dict_entries.push(entry);
    }

    let category_references = make_category_references(categories, &unknown_dict_entries[..]);
    let costs = make_costs_array(&unknown_dict_entries[..]);
    Ok(UnknownDictionary {
        category_references,
        costs,
    })
}

// Placeholder test module: no unit tests are defined here yet.
#[cfg(test)]
mod tests {}