lindera_sqlite/lib.rs
//! # lindera-sqlite
//!
//! A SQLite FTS5 (Full-Text Search 5) tokenizer extension that provides support for
//! Chinese, Japanese, and Korean (CJK) text analysis using the Lindera morphological analyzer.
//!
//! ## Features
//!
//! - **CJK Language Support**: Tokenizes Chinese, Japanese, and Korean text using Lindera
//! - **Multiple Dictionaries**: Supports various embedded dictionaries (IPADIC, UniDic, ko-dic, CC-CEDICT)
//! - **Configurable**: Uses YAML configuration for character filters and token filters
//! - **SQLite Integration**: Seamlessly integrates with SQLite's FTS5 full-text search
//!
//! ## Usage
//!
//! ### Building the Extension
//!
//! ```bash
//! cargo build --release --features=embedded-cjk
//! ```
//!
//! ### Setting Up Configuration
//!
//! Set the `LINDERA_CONFIG_PATH` environment variable to point to your Lindera configuration file:
//!
//! ```bash
//! export LINDERA_CONFIG_PATH=./resources/lindera.yml
//! ```
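//!
//! The configuration file selects the dictionary, the segmentation mode, and any
//! character/token filter chains. As a rough sketch (the key and filter names
//! below are illustrative; consult the Lindera documentation for the
//! authoritative schema):
//!
//! ```yaml
//! segmenter:
//!   mode: "normal"
//!   dictionary:
//!     kind: "ipadic"
//!
//! character_filters:
//!   - kind: "unicode_normalize"
//!     args:
//!       kind: "nfkc"
//!
//! token_filters:
//!   - kind: "lowercase"
//! ```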
//!
//! ### Loading in SQLite
//!
//! ```sql
//! .load ./target/release/liblindera_sqlite lindera_fts5_tokenizer_init
//! ```
//!
//! ### Creating an FTS5 Table
//!
//! ```sql
//! CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
//! ```
//!
//! ### Searching
//!
//! ```sql
//! INSERT INTO example(content) VALUES ('日本語の全文検索');
//! SELECT * FROM example WHERE content MATCH '検索';
//! ```
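//!
//! FTS5's standard auxiliary functions work with this tokenizer as usual; for
//! example, `highlight()` marks the matched tokens in the stored text:
//!
//! ```sql
//! SELECT highlight(example, 0, '[', ']') FROM example WHERE content MATCH '検索';
//! ```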
//!
//! ## Architecture
//!
//! This library provides a C ABI interface for SQLite to use Lindera as a custom FTS5 tokenizer.
//! The main components are:
//!
//! - [`load_tokenizer`]: Initializes a Lindera tokenizer with configuration
//! - [`lindera_fts5_tokenize`]: C-compatible entry point for tokenization (called by SQLite)
//! - Internal tokenization logic that converts text to tokens and calls back to SQLite
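//!
//! The same pipeline can be exercised from Rust without going through SQLite.
//! A minimal sketch (assuming `LINDERA_CONFIG_PATH` points at a valid
//! configuration file) that prints each token's surface form and byte offsets:
//!
//! ```no_run
//! use lindera_sqlite::load_tokenizer;
//!
//! let tokenizer = load_tokenizer().expect("failed to load tokenizer");
//! for token in tokenizer.tokenize("日本語の全文検索").unwrap() {
//!     println!("{} ({}..{})", token.surface, token.byte_start, token.byte_end);
//! }
//! ```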

extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

/// Loads and initializes a Lindera tokenizer.
///
/// This function creates a new Lindera tokenizer using the configuration specified
/// by the `LINDERA_CONFIG_PATH` environment variable. The configuration file controls
/// segmentation mode, character filters, and token filters.
///
/// # Returns
///
/// - `Ok(Tokenizer)` - Successfully initialized tokenizer
/// - `Err(c_int)` - Returns [`SQLITE_INTERNAL`] if tokenizer creation fails
///
/// # Errors
///
/// This function will return an error if:
/// - The tokenizer builder cannot be created (e.g., missing or invalid configuration)
/// - The tokenizer cannot be built from the builder
///
/// Error messages are written to stderr for debugging purposes.
///
/// # Examples
///
/// Set the configuration path environment variable before loading:
///
/// ```bash
/// export LINDERA_CONFIG_PATH=./resources/lindera.yml
/// ```
///
/// Then load the tokenizer:
///
/// ```no_run
/// # use lindera_sqlite::load_tokenizer;
/// let tokenizer = load_tokenizer().expect("Failed to load tokenizer");
/// ```
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}

/// C-compatible FTS5 tokenization function.
///
/// This is the main entry point called by SQLite's FTS5 extension to tokenize text.
/// It follows the FTS5 tokenizer API specification and provides panic safety by catching
/// any Rust panics that might occur during tokenization.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer passed to the token callback function
/// - `_flags` - Tokenization flags (currently unused)
/// - `p_text` - Pointer to the input text buffer (UTF-8 encoded)
/// - `n_text` - Length of the input text in bytes
/// - `x_token` - Callback function invoked for each token found
///
/// # Returns
///
/// - [`SQLITE_OK`] - Tokenization completed successfully
/// - [`SQLITE_INTERNAL`] - An internal error occurred (including panics)
/// - Other SQLite error codes propagated from the token callback
///
/// # Safety
///
/// This function is marked as `unsafe(no_mangle)` and `extern "C"` for FFI compatibility.
/// It wraps the internal tokenization logic with panic catching to prevent unwinding
/// across the FFI boundary, which would be undefined behavior.
///
/// The caller must ensure:
/// - `tokenizer` points to a valid [`Fts5Tokenizer`] instance
/// - `p_text` points to a valid, readable buffer of at least `n_text` bytes
///   (the text is expected to be UTF-8; invalid UTF-8 is tolerated and produces no tokens)
/// - `x_token` is a valid function pointer
///
/// # C API Example
///
/// ```c
/// // Called by SQLite FTS5 when tokenizing text
/// int rc = lindera_fts5_tokenize(
///     tokenizer,
///     context,
///     0,
///     "日本語テキスト",
///     strlen("日本語テキスト"),
///     my_token_callback
/// );
/// ```
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}

/// Internal tokenization implementation.
///
/// Performs the actual tokenization of input text and invokes the callback function
/// for each token produced. This function handles UTF-8 validation, tokenization,
/// and error propagation.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer to pass to the token callback
/// - `p_text` - Raw pointer to UTF-8 encoded text
/// - `n_text` - Length of the text in bytes
/// - `x_token` - Callback function to invoke for each token
///
/// # Returns
///
/// - `Ok(())` - All tokens processed successfully
/// - `Err(SQLITE_OK)` - Invalid UTF-8 input (treated as success to keep the database accessible)
/// - `Err(SQLITE_INTERNAL)` - Tokenization failed
/// - `Err(code)` - Error code returned by the token callback
///
/// # Safety
///
/// This function performs unsafe operations:
/// - Dereferences raw pointers (`tokenizer`, `p_text`)
/// - Creates a slice from a raw pointer and length
///
/// The caller must ensure all pointers are valid and properly aligned.
///
/// # Error Handling
///
/// - **UTF-8 Errors**: Mapped to [`SQLITE_OK`] to prevent database inaccessibility
/// - **Tokenization Errors**: Return [`SQLITE_INTERNAL`]
/// - **Callback Errors**: Propagated immediately, stopping tokenization
///
/// # Token Callback Protocol
///
/// For each token, the callback is invoked with:
/// - `p_ctx` - Context pointer (unchanged)
/// - `0` - Flags (currently always 0)
/// - Token surface as a C string pointer
/// - Token length in bytes
/// - Byte offset of the token start in the original text
/// - Byte offset of the token end in the original text
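///
/// As a sketch, a callback compatible with this protocol (the exact signature
/// is defined by [`TokenFunction`]; this mirrors the callback used in this
/// crate's tests) could simply count tokens:
///
/// ```ignore
/// extern "C" fn count_tokens(
///     p_ctx: *mut c_void,    // here assumed to point at a usize counter
///     _flags: c_int,
///     _token: *const c_char, // token surface bytes (not NUL-terminated)
///     _n_token: c_int,       // surface length in bytes
///     _start: c_int,         // byte offset of the token start
///     _end: c_int,           // byte offset of the token end
/// ) -> c_int {
///     unsafe { *(p_ctx as *mut usize) += 1 };
///     SQLITE_OK
/// }
/// ```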
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Map errors to SQLITE_OK because failing here would make the database
    // inaccessible.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            for token in tokens {
                let rc = x_token(
                    p_ctx,
                    0,
                    token.surface.as_bytes().as_ptr() as *const c_char,
                    token.surface.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut _ as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        // The expected surfaces and byte offsets depend on the character and
        // token filters configured in the file named by LINDERA_CONFIG_PATH.
        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("invalid UTF-8 should be mapped to an error code"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}