// lindera_sqlite/lib.rs

//! # lindera-sqlite
//!
//! A SQLite FTS5 (Full-Text Search 5) tokenizer extension that provides support for
//! Chinese, Japanese, and Korean (CJK) text analysis using the Lindera morphological analyzer.
//!
//! ## Features
//!
//! - **CJK Language Support**: Tokenizes Chinese, Japanese, and Korean text using Lindera
//! - **Multiple Dictionaries**: Supports various embedded dictionaries (IPADIC, UniDic, ko-dic, CC-CEDICT)
//! - **Configurable**: Uses YAML configuration for character filters and token filters
//! - **SQLite Integration**: Seamlessly integrates with SQLite's FTS5 full-text search
//!
//! ## Usage
//!
//! ### Building the Extension
//!
//! ```bash
//! cargo build --release --features=embedded-cjk
//! ```
//!
//! ### Setting Up Configuration
//!
//! Set the `LINDERA_CONFIG_PATH` environment variable to point to your Lindera configuration file:
//!
//! ```bash
//! export LINDERA_CONFIG_PATH=./resources/lindera.yml
//! ```
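//!
//! The configuration file selects the dictionary and segmentation mode and wires up
//! character and token filters. The sketch below illustrates the general shape of such
//! a file; the exact keys and filter names are assumptions that depend on the Lindera
//! version and the dictionaries you enabled, so consult your `resources/lindera.yml`:
//!
//! ```yaml
//! segmenter:
//!   mode: "normal"
//!   dictionary:
//!     kind: "ipadic"
//! character_filters:
//!   - kind: "unicode_normalize"
//!     args:
//!       kind: "nfkc"
//! token_filters:
//!   - kind: "japanese_katakana_stem"
//!     args:
//!       min: 3
//! ```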
//!
//! ### Loading in SQLite
//!
//! ```sql
//! .load ./target/release/liblindera_sqlite lindera_fts5_tokenizer_init
//! ```
//!
//! ### Creating an FTS5 Table
//!
//! ```sql
//! CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
//! ```
//!
//! ### Searching
//!
//! ```sql
//! INSERT INTO example(content) VALUES ('日本語の全文検索');
//! SELECT * FROM example WHERE content MATCH '検索';
//! ```
//!
//! ## Architecture
//!
//! This library provides a C ABI interface for SQLite to use Lindera as a custom FTS5 tokenizer.
//! The main components are:
//!
//! - [`load_tokenizer`]: Initializes a Lindera tokenizer with configuration
//! - [`lindera_fts5_tokenize`]: C-compatible entry point for tokenization (called by SQLite)
//! - Internal tokenization logic that converts text to tokens and calls back to SQLite
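//!
//! From Rust, the pipeline can also be exercised directly, without going through SQLite.
//! A minimal sketch (marked `ignore` because the exact `Token` type varies across Lindera
//! versions; `surface`, `byte_start`, and `byte_end` are the fields this crate itself uses):
//!
//! ```ignore
//! use lindera_sqlite::load_tokenizer;
//!
//! let tokenizer = load_tokenizer().expect("LINDERA_CONFIG_PATH should point at a valid config");
//! for token in tokenizer.tokenize("日本語の全文検索").expect("tokenization should succeed") {
//!     println!("{} ({}..{})", token.surface, token.byte_start, token.byte_end);
//! }
//! ```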

extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

/// Loads and initializes a Lindera tokenizer.
///
/// This function creates a new Lindera tokenizer using the configuration specified
/// by the `LINDERA_CONFIG_PATH` environment variable. The configuration file controls
/// segmentation mode, character filters, and token filters.
///
/// # Returns
///
/// - `Ok(Tokenizer)` - Successfully initialized tokenizer
/// - `Err(c_int)` - Returns [`SQLITE_INTERNAL`] if tokenizer creation fails
///
/// # Errors
///
/// This function will return an error if:
/// - The tokenizer builder cannot be created (e.g., missing or invalid configuration)
/// - The tokenizer cannot be built from the builder
///
/// Error messages are written to stderr for debugging purposes.
///
/// # Examples
///
/// Set the configuration path environment variable before loading:
///
/// ```bash
/// export LINDERA_CONFIG_PATH=./resources/lindera.yml
/// ```
///
/// Then load the tokenizer:
///
/// ```no_run
/// # use lindera_sqlite::load_tokenizer;
/// let tokenizer = load_tokenizer().expect("Failed to load tokenizer");
/// ```
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}

/// C-compatible FTS5 tokenization function.
///
/// This is the main entry point called by SQLite's FTS5 extension to tokenize text.
/// It follows the FTS5 tokenizer API specification and provides panic safety by catching
/// any Rust panics that might occur during tokenization.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer passed to the token callback function
/// - `_flags` - Tokenization flags (currently unused)
/// - `p_text` - Pointer to the input text buffer (UTF-8 encoded)
/// - `n_text` - Length of the input text in bytes
/// - `x_token` - Callback function invoked for each token found
///
/// # Returns
///
/// - [`SQLITE_OK`] - Tokenization completed successfully
/// - [`SQLITE_INTERNAL`] - An internal error occurred (including panics)
/// - Other SQLite error codes propagated from the token callback
///
/// # Safety
///
/// This function is marked `#[unsafe(no_mangle)]` and `extern "C"` for FFI compatibility.
/// It wraps the internal tokenization logic with panic catching to prevent unwinding
/// across the FFI boundary, which would be undefined behavior.
///
/// The caller must ensure:
/// - `tokenizer` points to a valid [`Fts5Tokenizer`] instance
/// - `p_text` points to valid UTF-8 data of length `n_text`
/// - `x_token` is a valid function pointer
///
/// # C API Example
///
/// ```c
/// // Called by SQLite FTS5 when tokenizing text
/// int rc = lindera_fts5_tokenize(
///     tokenizer,
///     context,
///     0,
///     "日本語テキスト",
///     strlen("日本語テキスト"),
///     my_token_callback
/// );
/// ```
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    crate::common::ffi_panic_boundary(|| {
        lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token)
    })
}

/// Internal tokenization implementation.
///
/// Performs the actual tokenization of input text and invokes the callback function
/// for each token produced. This function handles UTF-8 validation, tokenization,
/// and error propagation.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer to pass to the token callback
/// - `p_text` - Raw pointer to UTF-8 encoded text
/// - `n_text` - Length of text in bytes
/// - `x_token` - Callback function to invoke for each token
///
/// # Returns
///
/// - `Ok(())` - All tokens processed successfully
/// - `Err(SQLITE_OK)` - Invalid UTF-8 input (treated as success to keep the database accessible)
/// - `Err(SQLITE_INTERNAL)` - Tokenization failed
/// - `Err(code)` - Error code returned by the token callback
///
/// # Safety
///
/// This function performs unsafe operations:
/// - Dereferences raw pointers (`tokenizer`, `p_text`)
/// - Creates slices from a raw pointer and length
///
/// The caller must ensure all pointers are valid and properly aligned.
///
/// # Error Handling
///
/// - **UTF-8 Errors**: Mapped to [`SQLITE_OK`] to prevent database inaccessibility
/// - **Tokenization Errors**: Return [`SQLITE_INTERNAL`]
/// - **Callback Errors**: Propagated immediately, stopping tokenization
///
/// # Token Callback Protocol
///
/// For each token, the callback is invoked with:
/// - `p_ctx` - Context pointer (unchanged)
/// - `0` - Flags (currently always 0)
/// - Token surface as C string pointer
/// - Token length in bytes
/// - Byte offset of token start in original text
/// - Byte offset of token end in original text
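///
/// As an illustration, a callback that simply counts tokens might look like the sketch
/// below, mirroring the signature used by the tests' `token_callback`. It is marked
/// `ignore` because the authoritative [`TokenFunction`] signature lives in `common`;
/// the parameter names here are illustrative assumptions.
///
/// ```ignore
/// extern "C" fn count_tokens(
///     p_ctx: *mut c_void,
///     _flags: c_int,
///     _p_token: *const c_char,
///     _n_token: c_int,
///     _i_start: c_int,
///     _i_end: c_int,
/// ) -> c_int {
///     // `p_ctx` is the context pointer passed to `lindera_fts5_tokenize`;
///     // here it is assumed to point at a `usize` counter.
///     let count = unsafe { &mut *(p_ctx as *mut usize) };
///     *count += 1;
///     SQLITE_OK
/// }
/// ```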
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let input = unsafe { InputText::from_raw_parts(p_text, n_text)? };
    let mut tokenizer = unsafe { TokenizerHandle::new(tokenizer)? };
    let callback = crate::common::TokenCallback::new(p_ctx, x_token);

    tokenizer.emit_tokens(input.as_str(), &callback)
}

/// Borrowed view of the [`Fts5Tokenizer`] behind the raw pointer handed over by SQLite.
struct TokenizerHandle<'a> {
    inner: &'a mut Fts5Tokenizer,
}

impl<'a> TokenizerHandle<'a> {
    /// Wraps the raw pointer, failing with [`SQLITE_INTERNAL`] if it is null.
    unsafe fn new(ptr: *mut Fts5Tokenizer) -> Result<Self, c_int> {
        let inner = unsafe { ptr.as_mut() }.ok_or(SQLITE_INTERNAL)?;
        Ok(Self { inner })
    }

    /// Tokenizes `input` and forwards each token's surface and byte range to `callback`.
    fn emit_tokens(
        &mut self,
        input: &str,
        callback: &crate::common::TokenCallback,
    ) -> Result<(), c_int> {
        let tokens = self
            .inner
            .tokenizer
            .tokenize(input)
            .map_err(|_| SQLITE_INTERNAL)?;

        for token in tokens {
            callback.emit(token.surface.as_bytes(), token.byte_start, token.byte_end)?;
        }

        Ok(())
    }
}

/// UTF-8 validated view over the raw text buffer passed in from C.
struct InputText<'a> {
    text: &'a str,
}

impl<'a> InputText<'a> {
    /// Builds a `&str` from the raw parts, failing with [`SQLITE_OK`] on invalid
    /// UTF-8 so that a bad row does not make the whole database inaccessible.
    unsafe fn from_raw_parts(ptr: *const c_char, len: c_int) -> Result<Self, c_int> {
        let slice = unsafe { core::slice::from_raw_parts(ptr as *const c_uchar, len as usize) };
        let text = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;
        Ok(Self { text })
    }

    fn as_str(&self) -> &str {
        self.text
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        // Mixed full-width ("Ｌｉｎｄｅｒａ") and half-width ("ｴﾝｼﾞﾝ") input: the
        // character filters normalize the surface forms, while the reported
        // offsets remain byte ranges into the original text.
        let input = "Ｌｉｎｄｅｒａは形態素解析ｴﾝｼﾞﾝです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("invalid UTF-8 input should be rejected"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}