Skip to main content

lindera/
token.rs

1use pyo3::prelude::*;
2
3use lindera::token::Token;
4
5/// Token object wrapping the Rust Token data.
6///
7/// This class provides robust access to token field and details.
8#[pyclass(name = "Token")]
9pub struct PyToken {
10    /// Surface form of the token.
11    #[pyo3(get)]
12    pub surface: String,
13
14    /// Start byte position in the original text.
15    #[pyo3(get)]
16    pub byte_start: usize,
17
18    /// End byte position in the original text.
19    #[pyo3(get)]
20    pub byte_end: usize,
21
22    /// Position index of the token.
23    #[pyo3(get)]
24    pub position: usize,
25
26    /// Word ID in the dictionary.
27    #[pyo3(get)]
28    pub word_id: u32,
29
30    /// Morphological details of the token.
31    #[pyo3(get)]
32    pub details: Option<Vec<String>>,
33}
34
35#[pymethods]
36impl PyToken {
37    /// Returns the detail at the specified index.
38    ///
39    /// # Arguments
40    ///
41    /// * `index` - Index of the detail to retrieve.
42    ///
43    /// # Returns
44    ///
45    /// The detail string if found, otherwise None.
46    #[pyo3(signature = (index))]
47    fn get_detail(&self, index: usize) -> Option<String> {
48        self.details.as_ref().and_then(|d| d.get(index).cloned())
49    }
50
51    /// Returns a string representation of the token.
52    fn __repr__(&self) -> String {
53        format!(
54            "<Token surface='{}', start={}, end={}, position={}, word_id={}>",
55            self.surface, self.byte_start, self.byte_end, self.position, self.word_id
56        )
57    }
58}
59
60impl PyToken {
61    pub fn from_token(mut token: Token) -> Self {
62        let details = token.details().iter().map(|s| s.to_string()).collect();
63        // Since lindera::token::Token.details() returns Vec<&str>, we convert to Vec<String>.
64        // Wait, Token.details() actually calls ensure_details() which loads from dictionary.
65
66        Self {
67            surface: token.surface.to_string(),
68            byte_start: token.byte_start,
69            byte_end: token.byte_end,
70            position: token.position,
71            word_id: token.word_id.id,
72            details: Some(details),
73        }
74    }
75}
76
77pub fn register(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
78    let py = parent_module.py();
79    let m = PyModule::new(py, "token")?;
80    m.add_class::<PyToken>()?;
81    parent_module.add_submodule(&m)?;
82    Ok(())
83}