lindera_sqlite/lib.rs
//! # lindera-sqlite
//!
//! A SQLite FTS5 (Full-Text Search 5) tokenizer extension that provides support for
//! Chinese, Japanese, and Korean (CJK) text analysis using the Lindera morphological analyzer.
//!
//! ## Features
//!
//! - **CJK Language Support**: Tokenizes Chinese, Japanese, and Korean text using Lindera
//! - **Multiple Dictionaries**: Supports various embedded dictionaries (IPADIC, UniDic, ko-dic, CC-CEDICT)
//! - **Configurable**: Uses a YAML configuration for character filters and token filters
//! - **SQLite Integration**: Integrates seamlessly with SQLite's FTS5 full-text search
//!
//! ## Usage
//!
//! ### Building the Extension
//!
//! ```bash
//! cargo build --release --features=embedded-cjk
//! ```
//!
//! ### Setting Up Configuration
//!
//! Set the `LINDERA_CONFIG_PATH` environment variable to point to your Lindera configuration file:
//!
//! ```bash
//! export LINDERA_CONFIG_PATH=./resources/lindera.yml
//! ```
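//!
//! The configuration file selects the segmentation mode, character filters, and
//! token filters used by the tokenizer; see the Lindera documentation for the
//! exact schema.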
//!
//! ### Loading in SQLite
//!
//! ```sql
//! .load ./target/release/liblindera_sqlite lindera_fts5_tokenizer_init
//! ```
//!
//! ### Creating an FTS5 Table
//!
//! ```sql
//! CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
//! ```
//!
//! ### Searching
//!
//! ```sql
//! INSERT INTO example(content) VALUES ('日本語の全文検索');
//! SELECT * FROM example WHERE content MATCH '検索';
//! ```
//!
//! ## Architecture
//!
//! This library provides a C ABI interface for SQLite to use Lindera as a custom FTS5 tokenizer.
//! The main components are:
//!
//! - [`load_tokenizer`]: Initializes a Lindera tokenizer with configuration
//! - [`lindera_fts5_tokenize`]: C-compatible entry point for tokenization (called by SQLite)
//! - Internal tokenization logic that converts text to tokens and calls back to SQLite
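//!
//! The same pipeline can also be exercised from Rust without SQLite. A minimal
//! sketch (it assumes `LINDERA_CONFIG_PATH` points at a valid configuration and
//! an embedded dictionary feature is enabled):
//!
//! ```no_run
//! use lindera_sqlite::load_tokenizer;
//!
//! let mut tokenizer = load_tokenizer().expect("failed to load tokenizer");
//! let tokens = tokenizer.tokenize("日本語の全文検索").expect("tokenization failed");
//! for token in tokens {
//!     println!("{} ({}..{})", token.surface, token.byte_start, token.byte_end);
//! }
//! ```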

extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

/// Loads and initializes a Lindera tokenizer.
///
/// This function creates a new Lindera tokenizer using the configuration specified
/// by the `LINDERA_CONFIG_PATH` environment variable. The configuration file controls
/// the segmentation mode, character filters, and token filters.
///
/// # Returns
///
/// - `Ok(Tokenizer)` - Successfully initialized tokenizer
/// - `Err(c_int)` - Returns [`SQLITE_INTERNAL`] if tokenizer creation fails
///
/// # Errors
///
/// This function will return an error if:
/// - The tokenizer builder cannot be created (e.g., missing or invalid configuration)
/// - The tokenizer cannot be built from the builder
///
/// Error messages are written to stderr for debugging purposes.
///
/// # Examples
///
/// Set the configuration path environment variable before loading:
///
/// ```bash
/// export LINDERA_CONFIG_PATH=./resources/lindera.yml
/// ```
///
/// Then load the tokenizer:
///
/// ```no_run
/// # use lindera_sqlite::load_tokenizer;
/// let tokenizer = load_tokenizer().expect("Failed to load tokenizer");
/// ```
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}

/// C-compatible FTS5 tokenization function.
///
/// This is the main entry point called by SQLite's FTS5 extension to tokenize text.
/// It follows the FTS5 tokenizer API specification and provides panic safety by catching
/// any Rust panics that might occur during tokenization.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer passed to the token callback function
/// - `_flags` - Tokenization flags (currently unused)
/// - `p_text` - Pointer to the input text buffer (UTF-8 encoded)
/// - `n_text` - Length of the input text in bytes
/// - `x_token` - Callback function invoked for each token found
///
/// # Returns
///
/// - [`SQLITE_OK`] - Tokenization completed successfully
/// - [`SQLITE_INTERNAL`] - An internal error occurred (including panics)
/// - Other SQLite error codes propagated from the token callback
///
/// # Safety
///
/// This function is marked `#[unsafe(no_mangle)]` and `extern "C"` for FFI compatibility.
/// It wraps the internal tokenization logic with panic catching to prevent unwinding
/// across the FFI boundary, which would be undefined behavior.
///
/// The caller must ensure:
/// - `tokenizer` points to a valid [`Fts5Tokenizer`] instance
/// - `p_text` points to a readable buffer of at least `n_text` bytes
///   (invalid UTF-8 is tolerated and simply produces no tokens)
/// - `x_token` is a valid function pointer
///
/// # C API Example
///
/// ```c
/// // Called by SQLite FTS5 when tokenizing text
/// int rc = lindera_fts5_tokenize(
///     tokenizer,
///     context,
///     0,
///     "日本語テキスト",
///     strlen("日本語テキスト"),
///     my_token_callback
/// );
/// ```
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    crate::common::ffi_panic_boundary(|| {
        lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token)
    })
}

/// Internal tokenization implementation.
///
/// Performs the actual tokenization of input text and invokes the callback function
/// for each token produced. This function handles UTF-8 validation, tokenization,
/// and error propagation.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer to pass to the token callback
/// - `p_text` - Raw pointer to UTF-8 encoded text
/// - `n_text` - Length of text in bytes
/// - `x_token` - Callback function to invoke for each token
///
/// # Returns
///
/// - `Ok(())` - All tokens processed successfully
/// - `Err(SQLITE_OK)` - Invalid UTF-8 input (treated as success to keep the database accessible)
/// - `Err(SQLITE_INTERNAL)` - Tokenization failed
/// - `Err(code)` - Error code returned by the token callback
///
/// # Safety
///
/// This function performs unsafe operations:
/// - Dereferences raw pointers (`tokenizer`, `p_text`)
/// - Creates slices from a raw pointer and length
///
/// The caller must ensure all pointers are valid and properly aligned.
///
/// # Error Handling
///
/// - **UTF-8 Errors**: Mapped to [`SQLITE_OK`] to prevent database inaccessibility
/// - **Tokenization Errors**: Return [`SQLITE_INTERNAL`]
/// - **Callback Errors**: Propagated immediately, stopping tokenization
///
/// # Token Callback Protocol
///
/// For each token, the callback is invoked with:
/// - `p_ctx` - Context pointer (unchanged)
/// - `0` - Flags (currently always 0)
/// - Token surface as a C string pointer
/// - Token length in bytes
/// - Byte offset of the token start in the original text
/// - Byte offset of the token end in the original text
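///
/// See `token_callback` in this file's test module for a concrete example of a
/// compatible callback.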
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let input = unsafe { InputText::from_raw_parts(p_text, n_text)? };
    let mut tokenizer = unsafe { TokenizerHandle::new(tokenizer)? };
    let callback = crate::common::TokenCallback::new(p_ctx, x_token);

    tokenizer.emit_tokens(input.as_str(), &callback)
}

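/// Borrowed, pointer-validated view of an [`Fts5Tokenizer`] received over FFI.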
struct TokenizerHandle<'a> {
    inner: &'a mut Fts5Tokenizer,
}

impl<'a> TokenizerHandle<'a> {
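    /// Wraps `ptr`, returning [`SQLITE_INTERNAL`] if it is null.
    ///
    /// # Safety
    ///
    /// `ptr` must be either null or point to a valid, live `Fts5Tokenizer`.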
    unsafe fn new(ptr: *mut Fts5Tokenizer) -> Result<Self, c_int> {
        let inner = unsafe { ptr.as_mut() }.ok_or(SQLITE_INTERNAL)?;
        Ok(Self { inner })
    }

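    /// Tokenizes `input` and forwards each token's surface form and byte
    /// offsets to `callback`, stopping at the first callback error.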
    fn emit_tokens(
        &mut self,
        input: &str,
        callback: &crate::common::TokenCallback,
    ) -> Result<(), c_int> {
        let tokens = self
            .inner
            .tokenizer
            .tokenize(input)
            .map_err(|_| SQLITE_INTERNAL)?;

        for token in tokens {
            callback.emit(token.surface.as_bytes(), token.byte_start, token.byte_end)?;
        }

        Ok(())
    }
}

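/// UTF-8 validated view of the raw text buffer handed over by SQLite.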
struct InputText<'a> {
    text: &'a str,
}

impl<'a> InputText<'a> {
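    /// Builds a `&str` from `ptr` and `len`, mapping invalid UTF-8 to
    /// [`SQLITE_OK`] so that one undecodable value does not make the database
    /// inaccessible.
    ///
    /// # Safety
    ///
    /// `ptr` must point to at least `len` readable bytes.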
    unsafe fn from_raw_parts(ptr: *const c_char, len: c_int) -> Result<Self, c_int> {
        let slice = unsafe { core::slice::from_raw_parts(ptr as *const c_uchar, len as usize) };
        let text = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;
        Ok(Self { text })
    }

    fn as_str(&self) -> &str {
        self.text
    }
}

#[cfg(test)]
mod tests {
    use super::*;

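    /// Test callback matching the FTS5 token-callback signature: records each
    /// token and its byte offsets into the `Vec` passed through `ctx`.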
    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("expected UTF-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("invalid UTF-8 input should produce an error code"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}