lindera_ipadic_builder/
ipadic_builder.rs1use std::{fs, path::Path};
2
3use lindera_core::{
4 character_definition::CharacterDefinitions, dictionary::UserDictionary,
5 dictionary_builder::DictionaryBuilder, error::LinderaErrorKind, LinderaResult,
6};
7use lindera_decompress::Algorithm;
8use lindera_dictionary_builder::{
9 build_user_dictionary, CharDefBuilderOptions, CostMatrixBuilderOptions, DictBuilderOptions,
10 UnkBuilderOptions, UserDictBuilderOptions,
11};
12
13const SIMPLE_USERDIC_FIELDS_NUM: usize = 3;
14const SIMPLE_WORD_COST: i16 = -10000;
15const SIMPLE_CONTEXT_ID: u16 = 0;
16const DETAILED_USERDIC_FIELDS_NUM: usize = 13;
17const COMPRESS_ALGORITHM: Algorithm = Algorithm::Deflate;
18const UNK_FIELDS_NUM: usize = 11;
19const ENCODING: &'static str = "EUC-JP";
20
/// Builder type for this crate's dictionary.
///
/// The struct has no fields, so a value carries no state of its own.
/// `Debug` and `Clone` are derived so the public type can be logged and
/// duplicated like any other value.
#[derive(Debug, Clone)]
pub struct IpadicBuilder {}

impl IpadicBuilder {
    /// Creates a new, field-less builder.
    pub fn new() -> Self {
        Self {}
    }
}
28
29impl Default for IpadicBuilder {
30 fn default() -> Self {
31 Self::new()
32 }
33}
34
35impl DictionaryBuilder for IpadicBuilder {
36 fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
37 fs::create_dir_all(output_dir)
38 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
39
40 let chardef = self.build_chardef(input_dir, output_dir)?;
41 self.build_unk(input_dir, &chardef, output_dir)?;
42 self.build_dict(input_dir, output_dir)?;
43 self.build_cost_matrix(input_dir, output_dir)?;
44
45 Ok(())
46 }
47
48 fn build_user_dictionary(&self, input_file: &Path, output_file: &Path) -> LinderaResult<()> {
49 let user_dict = self.build_user_dict(input_file)?;
50 build_user_dictionary(user_dict, output_file)
51 }
52
53 fn build_chardef(
54 &self,
55 input_dir: &Path,
56 output_dir: &Path,
57 ) -> LinderaResult<CharacterDefinitions> {
58 CharDefBuilderOptions::default()
59 .encoding(ENCODING)
60 .compress_algorithm(COMPRESS_ALGORITHM)
61 .builder()
62 .unwrap()
63 .build(input_dir, output_dir)
64 }
65
66 fn build_unk(
67 &self,
68 input_dir: &Path,
69 chardef: &CharacterDefinitions,
70 output_dir: &Path,
71 ) -> LinderaResult<()> {
72 UnkBuilderOptions::default()
73 .encoding(ENCODING)
74 .compress_algorithm(COMPRESS_ALGORITHM)
75 .unk_fields_num(UNK_FIELDS_NUM)
76 .builder()
77 .unwrap()
78 .build(input_dir, chardef, output_dir)
79 }
80
81 fn build_dict(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
82 DictBuilderOptions::default()
83 .flexible_csv(false)
84 .encoding(ENCODING)
85 .compress_algorithm(COMPRESS_ALGORITHM)
86 .normalize_details(true)
87 .builder()
88 .unwrap()
89 .build(input_dir, output_dir)
90 }
91
92 fn build_cost_matrix(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
93 CostMatrixBuilderOptions::default()
94 .encoding(ENCODING)
95 .compress_algorithm(COMPRESS_ALGORITHM)
96 .builder()
97 .unwrap()
98 .build(&input_dir, output_dir)
99 }
100
101 fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
102 UserDictBuilderOptions::default()
103 .simple_userdic_fields_num(SIMPLE_USERDIC_FIELDS_NUM)
104 .detailed_userdic_fields_num(DETAILED_USERDIC_FIELDS_NUM)
105 .simple_word_cost(SIMPLE_WORD_COST)
106 .simple_context_id(SIMPLE_CONTEXT_ID)
107 .flexible_csv(true)
108 .simple_userdic_details_handler(Box::new(|row| {
109 Ok(vec![
110 row[1].to_string(), "*".to_string(), "*".to_string(), "*".to_string(), "*".to_string(), "*".to_string(), row[0].to_string(), row[2].to_string(), "*".to_string(), ])
120 }))
121 .builder()
122 .unwrap()
123 .build(input_file)
124 }
125}