lindera/
mode.rs

//! Tokenization modes and penalty configurations.
//!
//! This module defines the different tokenization modes available and their
//! penalty configurations for controlling segmentation behavior.
//!
//! # Modes
//!
//! - **Normal**: Standard tokenization based on dictionary cost
//! - **Decompose**: Decomposes compound words with penalty-based control
//!
//! # Examples
//!
//! ```python
//! # Normal mode
//! tokenizer = lindera.TokenizerBuilder().set_mode("normal").build()
//!
//! # Decompose mode
//! tokenizer = lindera.TokenizerBuilder().set_mode("decompose").build()
//!
//! # Custom penalty configuration
//! penalty = lindera.Penalty(
//!     kanji_penalty_length_threshold=2,
//!     kanji_penalty_length_penalty=3000
//! )
//! ```
26
27use pyo3::prelude::*;
28
29use lindera::mode::{Mode as LinderaMode, Penalty as LinderaPenalty};
30
31/// Tokenization mode.
32///
33/// Determines how text is segmented into tokens.
34#[pyclass(name = "Mode")]
35#[derive(Debug, Clone, Copy)]
36pub enum PyMode {
37    /// Standard tokenization based on dictionary cost
38    Normal,
39    /// Decompose compound words using penalty-based segmentation
40    Decompose,
41}
42
43#[pymethods]
44impl PyMode {
45    #[new]
46    #[pyo3(signature = (mode_str=None))]
47    pub fn new(mode_str: Option<&str>) -> PyResult<Self> {
48        match mode_str {
49            Some("decompose") | Some("Decompose") => Ok(PyMode::Decompose),
50            Some("normal") | Some("Normal") | None => Ok(PyMode::Normal),
51            Some(s) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
52                "Invalid mode: {s}. Must be 'normal' or 'decompose'"
53            ))),
54        }
55    }
56
57    fn __str__(&self) -> &str {
58        match self {
59            PyMode::Normal => "normal",
60            PyMode::Decompose => "decompose",
61        }
62    }
63
64    fn __repr__(&self) -> String {
65        format!("Mode.{self:?}")
66    }
67
68    #[getter]
69    pub fn name(&self) -> &str {
70        self.__str__()
71    }
72
73    pub fn is_normal(&self) -> bool {
74        matches!(self, PyMode::Normal)
75    }
76
77    pub fn is_decompose(&self) -> bool {
78        matches!(self, PyMode::Decompose)
79    }
80}
81
82impl From<PyMode> for LinderaMode {
83    fn from(mode: PyMode) -> Self {
84        match mode {
85            PyMode::Normal => LinderaMode::Normal,
86            PyMode::Decompose => LinderaMode::Decompose(LinderaPenalty::default()),
87        }
88    }
89}
90
91impl From<LinderaMode> for PyMode {
92    fn from(mode: LinderaMode) -> Self {
93        match mode {
94            LinderaMode::Normal => PyMode::Normal,
95            LinderaMode::Decompose(_) => PyMode::Decompose,
96        }
97    }
98}
99
100/// Penalty configuration for decompose mode.
101///
102/// Controls how aggressively compound words are decomposed based on
103/// character type and length thresholds.
104///
105/// # Examples
106///
107/// ```python
108/// penalty = lindera.Penalty(
109///     kanji_penalty_length_threshold=2,
110///     kanji_penalty_length_penalty=3000,
111///     other_penalty_length_threshold=7,
112///     other_penalty_length_penalty=1700
113/// )
114/// ```
115#[pyclass(name = "Penalty")]
116#[derive(Debug, Clone, Copy)]
117pub struct PyPenalty {
118    kanji_penalty_length_threshold: usize,
119    kanji_penalty_length_penalty: i32,
120    other_penalty_length_threshold: usize,
121    other_penalty_length_penalty: i32,
122}
123
124#[pymethods]
125impl PyPenalty {
126    #[new]
127    #[pyo3(signature = (kanji_penalty_length_threshold=None, kanji_penalty_length_penalty=None, other_penalty_length_threshold=None, other_penalty_length_penalty=None))]
128    pub fn new(
129        kanji_penalty_length_threshold: Option<usize>,
130        kanji_penalty_length_penalty: Option<i32>,
131        other_penalty_length_threshold: Option<usize>,
132        other_penalty_length_penalty: Option<i32>,
133    ) -> Self {
134        PyPenalty {
135            kanji_penalty_length_threshold: kanji_penalty_length_threshold.unwrap_or(2),
136            kanji_penalty_length_penalty: kanji_penalty_length_penalty.unwrap_or(3000),
137            other_penalty_length_threshold: other_penalty_length_threshold.unwrap_or(7),
138            other_penalty_length_penalty: other_penalty_length_penalty.unwrap_or(1700),
139        }
140    }
141
142    #[getter]
143    pub fn get_kanji_penalty_length_threshold(&self) -> usize {
144        self.kanji_penalty_length_threshold
145    }
146
147    #[setter]
148    pub fn set_kanji_penalty_length_threshold(&mut self, value: usize) {
149        self.kanji_penalty_length_threshold = value;
150    }
151
152    #[getter]
153    pub fn get_kanji_penalty_length_penalty(&self) -> i32 {
154        self.kanji_penalty_length_penalty
155    }
156
157    #[setter]
158    pub fn set_kanji_penalty_length_penalty(&mut self, value: i32) {
159        self.kanji_penalty_length_penalty = value;
160    }
161
162    #[getter]
163    pub fn get_other_penalty_length_threshold(&self) -> usize {
164        self.other_penalty_length_threshold
165    }
166
167    #[setter]
168    pub fn set_other_penalty_length_threshold(&mut self, value: usize) {
169        self.other_penalty_length_threshold = value;
170    }
171
172    #[getter]
173    pub fn get_other_penalty_length_penalty(&self) -> i32 {
174        self.other_penalty_length_penalty
175    }
176
177    #[setter]
178    pub fn set_other_penalty_length_penalty(&mut self, value: i32) {
179        self.other_penalty_length_penalty = value;
180    }
181
182    fn __str__(&self) -> String {
183        format!(
184            "Penalty(kanji_threshold={}, kanji_penalty={}, other_threshold={}, other_penalty={})",
185            self.kanji_penalty_length_threshold,
186            self.kanji_penalty_length_penalty,
187            self.other_penalty_length_threshold,
188            self.other_penalty_length_penalty
189        )
190    }
191
192    fn __repr__(&self) -> String {
193        self.__str__()
194    }
195}
196
197impl From<PyPenalty> for LinderaPenalty {
198    fn from(penalty: PyPenalty) -> Self {
199        LinderaPenalty {
200            kanji_penalty_length_threshold: penalty.kanji_penalty_length_threshold,
201            kanji_penalty_length_penalty: penalty.kanji_penalty_length_penalty,
202            other_penalty_length_threshold: penalty.other_penalty_length_threshold,
203            other_penalty_length_penalty: penalty.other_penalty_length_penalty,
204        }
205    }
206}
207
208impl From<LinderaPenalty> for PyPenalty {
209    fn from(penalty: LinderaPenalty) -> Self {
210        PyPenalty {
211            kanji_penalty_length_threshold: penalty.kanji_penalty_length_threshold,
212            kanji_penalty_length_penalty: penalty.kanji_penalty_length_penalty,
213            other_penalty_length_threshold: penalty.other_penalty_length_threshold,
214            other_penalty_length_penalty: penalty.other_penalty_length_penalty,
215        }
216    }
217}