1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
use std::io::{self, BufReader};
use std::fs::{self, File};
use std::path::Path;
use trie::{Searcher};
use dictionary;
use dictionary::ViterbiNode;
use util::*;
use {Utf16Char, Utf16String};


/// Word dictionary: a trie over the registered words plus per-word
/// attributes (cost, context IDs, feature data) addressed by word ID.
pub struct WordDic {
    /// Trie used for the common-prefix lookups performed by `search()`.
    trie: Searcher,
    /// Word data held as one UTF-8 string; `word_data()` slices it using
    /// `data_offsets`.
    data: String,
    /// `indices[trie ID]..indices[trie ID + 1]` is the range of word IDs
    /// that belong to the given trie ID.
    indices: Box<[i32]>,

    /// costs[word ID] = cost of the word
    costs: Box<[i16]>,
    /// left_ids[word ID] = left context ID of the word
    left_ids: Box<[i16]>,
    /// right_ids[word ID] = right context ID of the word
    right_ids: Box<[i16]>,
    /// data_offsets[word ID] = start position of the word's feature data
    /// (an index into `data`)
    data_offsets: Box<[i32]>
}

impl WordDic {
    /// Loads the word dictionary from the files found in `data_dir`.
    ///
    /// The following files are read:
    ///
    /// * `word2id`      - the trie handed to `Searcher::new`
    /// * `word.dat`     - the word data, converted to UTF-8 once at load time
    /// * `word.ary.idx` - the table stored in `indices`
    /// * `word.inf`     - read sequentially as: data offsets, left IDs,
    ///                    right IDs, costs (one entry per word each)
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported while reading any of the files.
    pub fn new(data_dir: &Path) -> io::Result<WordDic> {
        // Size of one word's share of `word.inf`: one i32 data offset plus
        // three i16 values (left ID, right ID, cost).
        const BYTES_PER_WORD: u64 = 4 + 2 + 2 + 2;

        let trie_file = data_dir.join("word2id");
        let data_file = data_dir.join("word.dat");
        let index_file = data_dir.join("word.ary.idx");
        let info_file = data_dir.join("word.inf");

        // The number of words is derived from the size of `word.inf`.
        let info_size = fs::metadata(&info_file)?.len();
        let mut info = BufReader::new(File::open(&info_file)?);
        let word_count = (info_size / BYTES_PER_WORD) as usize;
        debug!("word_count: {}", word_count);

        // The word data is stored as UTF-16; convert it (and the offsets
        // pointing into it) to UTF-8 once, up front.
        let utf16_data = read_all_as_chars(&data_file)?;
        let utf16_offsets = info.get_int_array(word_count)?;
        let (data, data_offsets) = convert2utf8_data(&utf16_data, &utf16_offsets);

        let trie = Searcher::new(&trie_file)?;
        let indices = read_all_as_int_array(&index_file)?;

        // The three i16 tables follow the offsets in `word.inf`; they must be
        // read from the same reader in exactly this order.
        let left_ids = info.get_short_array(word_count)?;
        let right_ids = info.get_short_array(word_count)?;
        let costs = info.get_short_array(word_count)?;

        Ok(WordDic {
            trie: trie,
            data: data,
            indices: indices,
            costs: costs,
            left_ids: left_ids,
            right_ids: right_ids,
            data_offsets: data_offsets
        })
    }

    /// Runs a common-prefix search over `text` beginning at `start` and
    /// passes one `ViterbiNode` to `callback` for every word ID attached to
    /// each key that matches.
    ///
    /// Nodes produced here always have `is_space` set to `false` and no
    /// `prev` node.
    pub fn search(&self, text: &Utf16String, start: usize, callback: &mut dictionary::Callback) {
        // The closure below is invoked by each_common_prefix() every time a
        // substring matching a key is found:
        //
        //   begin   - search start position in the input text
        //   offset  - end position of the matched substring
        //   trie_id - ID of the matched substring
        self.trie.each_common_prefix(text, start, |begin: usize, offset: i32, trie_id: i32| {
            let key = trie_id as usize;
            let upper: i32 = self.indices[key + 1];
            let lower: i32 = self.indices[key];

            for word_id in lower..upper {
                let slot = word_id as usize;
                let node = ViterbiNode {
                    word_id: word_id,
                    start: begin,
                    length: offset as i16,
                    cost: self.costs[slot] as i32,
                    left_id: self.left_ids[slot],
                    right_id: self.right_ids[slot],
                    is_space: false,
                    prev: None
                };
                callback.call(node);
            }
        });
    }

    /// Passes one `ViterbiNode` to `callback` for every word ID attached to
    /// `trie_id`, without performing a trie search.
    ///
    /// `start`, `word_length` and `is_space` are copied into each node as
    /// given (`word_length` is narrowed to `i16`).
    ///
    /// # Panics
    ///
    /// Panics if `trie_id + 1` is not a valid index into the index table, or
    /// if a word ID found there is out of range for the per-word tables.
    pub fn search_from_trie_id(&self, trie_id: i32, start: usize, word_length: usize,
                               is_space: bool, callback: &mut dictionary::Callback) {
        let key = trie_id as usize;
        let upper = self.indices[key + 1];
        let lower = self.indices[key];

        for word_id in lower..upper {
            let slot = word_id as usize;
            let node = ViterbiNode {
                word_id: word_id,
                start: start,
                length: word_length as i16,
                cost: self.costs[slot] as i32,
                left_id: self.left_ids[slot],
                right_id: self.right_ids[slot],
                is_space: is_space,
                prev: None
            };
            callback.call(node);
        }
    }

    /// Returns the feature data of the word identified by `word_id`.
    ///
    /// # Panics
    ///
    /// Panics if `word_id + 1` is not a valid index into the offset table.
    pub fn word_data(&self, word_id: i32) -> &str {
        let id = word_id as usize;
        let begin = self.data_offsets[id] as usize;
        let end = self.data_offsets[id + 1] as usize;
        &self.data[begin..end]
    }
}

/// Converts the UTF-16 word data into a single UTF-8 `String` ahead of time,
/// so that `word_data()` can return `&str` slices.
///
/// `offsets[i]..offsets[i + 1]` delimits word `i` inside `utf16_str`, counted
/// in UTF-16 code units. The returned offset table has the same length as
/// `offsets` and delimits the same words as byte positions inside the
/// returned string; its first entry is always 0. Invalid UTF-16 is replaced
/// with U+FFFD rather than reported.
///
/// When `offsets` has fewer than two entries it describes no words: the
/// returned string is empty and the returned table is all zeros.
///
/// # Panics
///
/// Panics if a pair of consecutive offsets is decreasing or reaches past the
/// end of `utf16_str`.
fn convert2utf8_data(utf16_str: &[Utf16Char], offsets: &[i32]) -> (String, Box<[i32]>) {
    // A UTF-16 code unit never needs more than three bytes in UTF-8.
    let mut buf = String::with_capacity(utf16_str.len() * 3);
    let mut new_offset = vec![0i32; offsets.len()];

    // `windows(2)` yields nothing for fewer than two offsets, which avoids
    // the usize underflow that `offsets.len() - 1` hits on an empty slice.
    for (word_id, bounds) in offsets.windows(2).enumerate() {
        let begin = bounds[0] as usize;
        let end = bounds[1] as usize;
        buf.push_str(&String::from_utf16_lossy(&utf16_str[begin..end]));
        new_offset[word_id + 1] = buf.len() as i32;
    }
    debug!("buf size: {} / {}", buf.len(), buf.capacity());

    (buf, new_offset.into_boxed_slice())
}