lindera_sqlite/
lib.rs

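//! SQLite FTS5 tokenizer backed by the Lindera morphological analyzer,
//! exposed through a C ABI so the crate can be loaded as an SQLite extension.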
extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;
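
/// Builds a Lindera [`Tokenizer`] via [`TokenizerBuilder`], mapping any
/// failure to `SQLITE_INTERNAL` so the error code can be handed straight
/// back to SQLite.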
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}
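
/// FTS5 `xTokenize` callback. A Rust panic must not cross the C ABI
/// boundary, so the actual work runs under `catch_unwind` and a caught
/// panic is reported to SQLite as `SQLITE_INTERNAL`.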
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}
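
/// Error codes returned here are passed back to SQLite verbatim by
/// `lindera_fts5_tokenize`; `Err(SQLITE_OK)` is used deliberately to skip
/// undecodable input without failing the whole statement.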
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Map UTF-8 errors to SQLITE_OK: returning a real error here would make
    // the database inaccessible, so undecodable text is skipped instead.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            // Hand each token's surface form and byte span to the FTS5
            // callback; stop at the first non-OK return code.
            for token in tokens {
                let rc = x_token(
                    p_ctx,
                    0,
                    token.surface.as_bytes().as_ptr() as *const c_char,
                    token.surface.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}
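
// A sketch of how the tokenizer is used from SQL once the extension is
// loaded. The tokenizer name registered by the `extension` module is not
// shown in this file; "lindera" below is an assumption:
//
//   CREATE VIRTUAL TABLE docs USING fts5(body, tokenize = 'lindera');
//   INSERT INTO docs(body) VALUES ('Linderaは形態素解析エンジンです。');
//   SELECT * FROM docs WHERE docs MATCH '形態素';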

#[cfg(test)]
mod tests {
    use super::*;

    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        // Recover the `Vec` handed through FTS5's opaque context pointer.
        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }
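
    // A sketch of an additional test (not in the original suite) covering the
    // `n_text <= 0` guard: the input pointer is never dereferenced for empty
    // input, so a null pointer with length zero must be a no-op.
    #[test]
    fn it_accepts_empty_input() {
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            core::ptr::null(),
            0,
            token_callback,
        )
        .expect("empty input should be a no-op");

        assert!(tokens.is_empty());
    }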

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("tokenizing invalid UTF-8 should return an error code"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}
167}