import tempfile
from pathlib import Path
from lindera import export, train
def create_training_data(tmpdir: Path):
    """Write a minimal set of training input files into *tmpdir*.

    Seven files are created: a seed lexicon, a character definition, an
    unknown-word definition, a feature template definition, rewrite rules,
    a small annotated corpus, and a metadata JSON file.

    All files are written explicitly as UTF-8. The content is Japanese text
    and ``metadata.json`` declares ``"encoding": "utf-8"``, so relying on the
    platform's default locale encoding could either fail with
    ``UnicodeEncodeError`` or produce files that contradict the metadata.

    Args:
        tmpdir: Existing directory in which the files are created.

    Returns:
        A dict mapping the keys ``"seed"``, ``"char_def"``, ``"unk_def"``,
        ``"feature_def"``, ``"rewrite_def"``, ``"corpus"`` and ``"metadata"``
        to the ``Path`` of the corresponding file.
    """

    def write(filename: str, content: str) -> Path:
        # Single place that fixes the on-disk encoding for every fixture file.
        target = tmpdir / filename
        target.write_text(content, encoding="utf-8")
        return target

    seed_file = write(
        "seed.csv",
        "外国,0,0,0,名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人,0,0,0,名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政,0,0,0,名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権,0,0,0,名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "これ,0,0,0,名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は,0,0,0,助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト,0,0,0,名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です,0,0,0,助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。,0,0,0,記号,句点,*,*,*,*,。,。,。\n"
        "形態,0,0,0,名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素,0,0,0,名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析,0,0,0,名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を,0,0,0,助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う,0,0,0,動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n",
    )

    char_def_file = write(
        "char.def",
        "# Character definition for training\n"
        "DEFAULT 0 1 0\n"
        "HIRAGANA 1 1 0\n"
        "KATAKANA 1 1 0\n"
        "KANJI 0 0 2\n"
        "ALPHA 1 1 0\n"
        "NUMERIC 1 1 0\n"
        "\n"
        "# Character mappings (simplified)\n"
        "0x3041..0x3096 HIRAGANA\n"
        "0x30A1..0x30F6 KATAKANA\n"
        "0x4E00..0x9FAF KANJI\n"
        "0x0030..0x0039 NUMERIC\n"
        "0x0041..0x005A ALPHA\n"
        "0x0061..0x007A ALPHA\n",
    )

    unk_def_file = write(
        "unk.def",
        "# Unknown word definitions\n"
        "DEFAULT,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "HIRAGANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KATAKANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KANJI,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "ALPHA,0,0,0,名詞,固有名詞,一般,*,*,*,*,*,*\n"
        "NUMERIC,0,0,0,名詞,数,*,*,*,*,*,*,*\n",
    )

    feature_def_file = write(
        "feature.def",
        "# Feature template definitions for training\n"
        "# These define how features are extracted from the morphological data\n"
        "\n"
        "# Unigram features (word-level features)\n"
        "UNIGRAM U00:%F[0] # Part of speech\n"
        "UNIGRAM U01:%F[0],%F?[1] # POS + sub-category\n"
        "UNIGRAM U02:%F[0],%F[1],%F?[2] # POS hierarchy\n"
        "\n"
        "# Bigram features (transition features between words)\n"
        "# Format: BIGRAM label:%L[index]/%R[index]\n"
        "# %L = left context (previous word), %R = right context (next word)\n"
        "BIGRAM B00:%L[0]/%R[0] # POS-to-POS transition\n"
        "BIGRAM B01:%L[0],%L?[1]/%R[0] # Left POS hierarchy to right POS\n"
        "BIGRAM B02:%L[0]/%R[0],%R?[1] # Left POS to right POS hierarchy\n"
        "BIGRAM B03:%L[0],%L[1],%L?[2]/%R[0] # Detailed left to simple right\n",
    )

    rewrite_def_file = write(
        "rewrite.def",
        "# Rewrite rules for feature normalization\n"
        "# Format: original_pattern\treplacement_pattern\n"
        "\n"
        '# Test rewrite: convert "名詞,一般" to "NOUN,GENERAL"\n'
        "名詞,一般\tNOUN,GENERAL\n"
        "\n"
        '# Test rewrite: convert "助詞,係助詞" to "PARTICLE,KAKUJOSHI"\n'
        "助詞,係助詞\tPARTICLE,KAKUJOSHI\n"
        "\n"
        "# Normalize numeric expressions\n"
        "数\tNUM\n",
    )

    # Three sentences, each terminated by an "EOS" line.
    corpus_file = write(
        "corpus.txt",
        "外国\t名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人\t名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政\t名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権\t名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "EOS\n"
        "\n"
        "これ\t名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト\t名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。\t記号,句点,*,*,*,*,。,。,。\n"
        "EOS\n"
        "\n"
        "形態\t名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素\t名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析\t名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う\t動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
        "EOS\n",
    )

    metadata_file = write(
        "metadata.json",
        '{\n "name": "custom-dict",\n "version": "1.0.0",\n "encoding": "utf-8"\n}\n',
    )

    return {
        "seed": seed_file,
        "char_def": char_def_file,
        "unk_def": unk_def_file,
        "feature_def": feature_def_file,
        "rewrite_def": rewrite_def_file,
        "corpus": corpus_file,
        "metadata": metadata_file,
    }
def main():
    """Run the demo end to end inside a temporary directory.

    Generates the training inputs with ``create_training_data``, trains a
    model with ``train``, exports dictionary files with ``export``, and
    prints the name and size of every exported file. Everything lives under
    a ``tempfile.TemporaryDirectory``, so it is cleaned up when the ``with``
    block exits.
    """
    print("=== Lindera Training and Export Example ===\n")

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        print(f"Working directory: {workdir}\n")

        # Step 1: materialize the input files.
        print("Step 1: Creating training data...")
        files = create_training_data(workdir)
        print("✓ Training data created\n")

        # Step 2: train; the call receives every path as a plain string.
        print("Step 2: Training model...")
        model_path = workdir / "model.dat"
        input_keys = (
            "seed",
            "corpus",
            "char_def",
            "unk_def",
            "feature_def",
            "rewrite_def",
        )
        training_inputs = {key: str(files[key]) for key in input_keys}
        train(
            **training_inputs,
            output=str(model_path),
            lambda_=0.01,
            max_iter=10,
            max_threads=None,
        )
        print(f"✓ Model trained and saved to: {model_path}\n")

        # Step 3: export dictionary files from the trained model.
        print("Step 3: Exporting dictionary files...")
        dict_dir = workdir / "exported_dict"
        export(
            model=str(model_path),
            output=str(dict_dir),
            metadata=str(files["metadata"]),
        )
        print(f"✓ Dictionary files exported to: {dict_dir}\n")

        # Step 4: list what was produced, in sorted path order.
        print("Step 4: Exported files:")
        for entry in sorted(dict_dir.glob("*")):
            print(f" - {entry.name} ({entry.stat().st_size:,} bytes)")

        print("\n✓ Training and export completed successfully!")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()