extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;
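
/// Builds a Lindera [`Tokenizer`] from the default `TokenizerBuilder`
/// configuration, mapping any construction error to `SQLITE_INTERNAL`.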
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}
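
/// FTS5 `xTokenize` entry point. Panics must not unwind across the C FFI
/// boundary, so the fallible implementation runs inside `catch_unwind` and a
/// caught panic is reported to SQLite as `SQLITE_INTERNAL`.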
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}
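
/// Tokenizes the UTF-8 text at `p_text`/`n_text` with the wrapped Lindera
/// tokenizer and reports each token through the `x_token` callback. Returns the
/// SQLite error code to propagate as `Err`; invalid UTF-8 yields `Err(SQLITE_OK)`
/// so the text is treated as having no tokens.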
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Non-UTF-8 input is skipped rather than rejected: returning SQLITE_OK as the
    // "error" code means FTS5 simply sees no tokens for this text.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            for token in tokens {
                // Emit with flags = 0 (no FTS5_TOKEN_COLOCATED), the token text,
                // and the byte offsets Lindera reports for this token.
                let rc = x_token(
                    p_ctx,
                    0,
                    token.text.as_bytes().as_ptr() as *const c_char,
                    token.text.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
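
    /// Test `xToken` callback that collects each emitted token and its reported
    /// byte offsets into the `Vec` passed through the context pointer.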
    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("invalid UTF-8 should be reported as an error"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}