Skip to main content

lindera/
token.rs

1use pyo3::prelude::*;
2
3use lindera::token::Token;
4
5/// Token object wrapping the Rust Token data.
6///
7/// This class provides robust access to token field and details.
8#[pyclass(name = "Token")]
9pub struct PyToken {
10    /// Surface form of the token.
11    #[pyo3(get)]
12    pub surface: String,
13
14    /// Start byte position in the original text.
15    #[pyo3(get)]
16    pub byte_start: usize,
17
18    /// End byte position in the original text.
19    #[pyo3(get)]
20    pub byte_end: usize,
21
22    /// Position index of the token.
23    #[pyo3(get)]
24    pub position: usize,
25
26    /// Word ID in the dictionary.
27    #[pyo3(get)]
28    pub word_id: u32,
29
30    /// Whether this token is an unknown word (not found in the dictionary).
31    #[pyo3(get)]
32    pub is_unknown: bool,
33
34    /// Morphological details of the token.
35    #[pyo3(get)]
36    pub details: Option<Vec<String>>,
37}
38
39#[pymethods]
40impl PyToken {
41    /// Returns the detail at the specified index.
42    ///
43    /// # Arguments
44    ///
45    /// * `index` - Index of the detail to retrieve.
46    ///
47    /// # Returns
48    ///
49    /// The detail string if found, otherwise None.
50    #[pyo3(signature = (index))]
51    fn get_detail(&self, index: usize) -> Option<String> {
52        self.details.as_ref().and_then(|d| d.get(index).cloned())
53    }
54
55    /// Returns a string representation of the token.
56    fn __repr__(&self) -> String {
57        format!(
58            "<Token surface='{}', start={}, end={}, position={}, word_id={}, is_unknown={}>",
59            self.surface,
60            self.byte_start,
61            self.byte_end,
62            self.position,
63            self.word_id,
64            self.is_unknown
65        )
66    }
67}
68
69impl PyToken {
70    pub fn from_token(mut token: Token) -> Self {
71        let details = token.details().iter().map(|s| s.to_string()).collect();
72        // Since lindera::token::Token.details() returns Vec<&str>, we convert to Vec<String>.
73        // Wait, Token.details() actually calls ensure_details() which loads from dictionary.
74
75        Self {
76            surface: token.surface.to_string(),
77            byte_start: token.byte_start,
78            byte_end: token.byte_end,
79            position: token.position,
80            word_id: token.word_id.id,
81            is_unknown: token.word_id.is_unknown(),
82            details: Some(details),
83        }
84    }
85}
86
87pub fn register(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
88    let py = parent_module.py();
89    let m = PyModule::new(py, "token")?;
90    m.add_class::<PyToken>()?;
91    parent_module.add_submodule(&m)?;
92    Ok(())
93}