lindera_sqlite/
lib.rs

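//! C ABI glue exposing the Lindera morphological analyzer as an SQLite
//! FTS5 tokenizer.
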
extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

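/// Builds the Lindera [`Tokenizer`], mapping any builder or build failure
/// to `SQLITE_INTERNAL` so the caller can return it as an SQLite result code.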
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {}", e);
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {}", e);
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}

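/// FTS5 `xTokenize` entry point. The work happens in
/// `lindera_fts5_tokenize_internal`; it is wrapped in `catch_unwind` because
/// a panic must not unwind across the C FFI boundary, so any panic is
/// reported to SQLite as `SQLITE_INTERNAL` instead.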
#[no_mangle]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}

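/// Reads `n_text` bytes from `p_text`, tokenizes them with the wrapped
/// Lindera tokenizer, and invokes the `x_token` callback once per token,
/// stopping at the first non-`SQLITE_OK` return code.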
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Map UTF-8 errors to SQLITE_OK because failing here would make the
    // database inaccessible; invalid text simply produces no tokens.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            for token in tokens {
                let rc = x_token(
                    p_ctx,
                    0,
                    token.text.as_bytes().as_ptr() as *const c_char,
                    token.text.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

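    // Test double for FTS5's `xToken` callback: collects every reported
    // (token text, start, end) triple into the `Vec` passed through `ctx`.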
85    extern "C" fn token_callback(
86        ctx: *mut c_void,
87        flags: c_int,
88        token: *const c_char,
89        token_len: c_int,
90        start: c_int,
91        end: c_int,
92    ) -> c_int {
93        assert_eq!(flags, 0);
94
95        let tokens_ptr = ctx as *mut _ as *mut Vec<(String, c_int, c_int)>;
96        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
97        let slice =
98            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
99        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");
100
101        tokens.push((token, start, end));
102
103        return SQLITE_OK;
104    }
105
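    // End-to-end check through the FFI signature: the callback should
    // receive the segmented tokens with the offsets Lindera reports.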
    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

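    // Invalid UTF-8 is mapped to SQLITE_OK (see the comment in
    // `lindera_fts5_tokenize_internal`) and must produce no tokens.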
    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("tokenize internal should fail for invalid UTF-8"),
            SQLITE_OK
        );

        assert!(tokens.is_empty());
    }
}