extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

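/// Builds a Lindera [`Tokenizer`], mapping any construction error to
/// `SQLITE_INTERNAL` so callers can surface it as an SQLite result code.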
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}
26
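/// C entry point for FTS5 tokenization (the `xTokenize` callback).
///
/// The internal implementation runs inside `catch_unwind` so a panic never
/// unwinds across the FFI boundary; a caught panic is reported as
/// `SQLITE_INTERNAL`.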
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}
44
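/// Tokenizes `p_text` and reports each token to SQLite via `x_token`.
///
/// Returns `Err` carrying the SQLite result code to propagate: `SQLITE_OK`
/// for input that is silently ignored (empty or invalid UTF-8), the
/// callback's own code if it rejects a token, or `SQLITE_INTERNAL` when the
/// tokenizer itself fails.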
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Invalid UTF-8 is not treated as an error: report success and emit no tokens.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            for token in tokens {
                let rc = x_token(
                    p_ctx,
                    0,
                    token.text.as_bytes().as_ptr() as *const c_char,
                    token.text.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}
86
#[cfg(test)]
mod tests {
    use super::*;

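    /// Collects each emitted token into the `Vec<(String, c_int, c_int)>`
    /// passed through `ctx`, standing in for SQLite's `xToken` callback.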
    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("tokenize internal should reject invalid UTF-8"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}