// lindera_sqlite/lib.rs

//! # lindera-sqlite
//!
//! A SQLite FTS5 (Full-Text Search 5) tokenizer extension that provides support for
//! Chinese, Japanese, and Korean (CJK) text analysis using the Lindera morphological analyzer.
//!
//! ## Features
//!
//! - **CJK Language Support**: Tokenizes Chinese, Japanese, and Korean text using Lindera
//! - **Multiple Dictionaries**: Supports various embedded dictionaries (IPADIC, UniDic, ko-dic, CC-CEDICT)
//! - **Configurable**: Uses YAML configuration for character filters and token filters
//! - **SQLite Integration**: Seamlessly integrates with SQLite's FTS5 full-text search
//!
//! ## Usage
//!
//! ### Building the Extension
//!
//! ```bash
//! cargo build --release --features=embedded-cjk
//! ```
//!
//! ### Setting Up Configuration
//!
//! Set the `LINDERA_CONFIG_PATH` environment variable to point to your Lindera configuration file:
//!
//! ```bash
//! export LINDERA_CONFIG_PATH=./resources/lindera.yml
//! ```
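//!
//! The referenced file is a regular Lindera tokenizer configuration. As a rough,
//! hypothetical sketch of its shape (the exact schema, dictionary kinds, and
//! filter names are defined by Lindera, so consult its documentation):
//!
//! ```yaml
//! # Hypothetical example; field names and values follow Lindera's schema.
//! segmenter:
//!   mode: "normal"
//!   dictionary:
//!     kind: "ipadic"
//!
//! character_filters:
//!   - kind: "unicode_normalize"
//!     args:
//!       kind: "nfkc"
//!
//! token_filters:
//!   - kind: "lowercase"
//! ```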
//!
//! ### Loading in SQLite
//!
//! ```sql
//! .load ./target/release/liblindera_sqlite lindera_fts5_tokenizer_init
//! ```
//!
//! ### Creating an FTS5 Table
//!
//! ```sql
//! CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
//! ```
//!
//! ### Searching
//!
//! ```sql
//! INSERT INTO example(content) VALUES ('日本語の全文検索');
//! SELECT * FROM example WHERE content MATCH '検索';
//! ```
//!
//! ## Architecture
//!
//! This library provides a C ABI interface for SQLite to use Lindera as a custom FTS5 tokenizer.
//! The main components are:
//!
//! - [`load_tokenizer`]: Initializes a Lindera tokenizer with configuration
//! - [`lindera_fts5_tokenize`]: C-compatible entry point for tokenization (called by SQLite)
//! - Internal tokenization logic that converts text to tokens and calls back to SQLite
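//!
//! As a rough sketch of how these pieces fit together when driven directly from
//! Rust (adapted from this crate's own tests; marked `ignore` because constructing
//! an [`Fts5Tokenizer`] by hand assumes its field is visible to the caller):
//!
//! ```ignore
//! use libc::{c_char, c_int, c_void};
//! use lindera_sqlite::*;
//!
//! // A callback that accepts every token without inspecting it.
//! extern "C" fn on_token(
//!     _ctx: *mut c_void,
//!     _flags: c_int,
//!     _token: *const c_char,
//!     _len: c_int,
//!     _start: c_int,
//!     _end: c_int,
//! ) -> c_int {
//!     SQLITE_OK
//! }
//!
//! let mut tokenizer = Fts5Tokenizer {
//!     tokenizer: load_tokenizer().expect("tokenizer should load"),
//! };
//! let text = "日本語の全文検索";
//! let rc = lindera_fts5_tokenize(
//!     &mut tokenizer,
//!     core::ptr::null_mut(), // no callback context needed here
//!     0,
//!     text.as_ptr() as *const c_char,
//!     text.len() as c_int,
//!     on_token,
//! );
//! assert_eq!(rc, SQLITE_OK);
//! ```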

extern crate alloc;

mod common;
#[cfg(feature = "extension")]
mod extension;

use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

/// Loads and initializes a Lindera tokenizer.
///
/// This function creates a new Lindera tokenizer using the configuration specified
/// by the `LINDERA_CONFIG_PATH` environment variable. The configuration file controls
/// segmentation mode, character filters, and token filters.
///
/// # Returns
///
/// - `Ok(Tokenizer)` - Successfully initialized tokenizer
/// - `Err(c_int)` - Returns [`SQLITE_INTERNAL`] if tokenizer creation fails
///
/// # Errors
///
/// This function will return an error if:
/// - The tokenizer builder cannot be created (e.g., missing or invalid configuration)
/// - The tokenizer cannot be built from the builder
///
/// Error messages are written to stderr for debugging purposes.
///
/// # Examples
///
/// Set the configuration path environment variable before loading:
///
/// ```bash
/// export LINDERA_CONFIG_PATH=./resources/lindera.yml
/// ```
///
/// Then load the tokenizer:
///
/// ```no_run
/// # use lindera_sqlite::load_tokenizer;
/// let tokenizer = load_tokenizer().expect("Failed to load tokenizer");
/// ```
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
    let builder = TokenizerBuilder::new().map_err(|e| {
        eprintln!("Failed to create tokenizer builder: {e}");
        SQLITE_INTERNAL
    })?;
    let tokenizer = builder.build().map_err(|e| {
        eprintln!("Failed to create tokenizer: {e}");
        SQLITE_INTERNAL
    })?;

    Ok(tokenizer)
}

/// C-compatible FTS5 tokenization function.
///
/// This is the main entry point called by SQLite's FTS5 extension to tokenize text.
/// It follows the FTS5 tokenizer API specification and provides panic safety by catching
/// any Rust panics that might occur during tokenization.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer passed to the token callback function
/// - `_flags` - Tokenization flags (currently unused)
/// - `p_text` - Pointer to the input text buffer (UTF-8 encoded)
/// - `n_text` - Length of the input text in bytes
/// - `x_token` - Callback function invoked for each token found
///
/// # Returns
///
/// - [`SQLITE_OK`] - Tokenization completed successfully
/// - [`SQLITE_INTERNAL`] - An internal error occurred (including panics)
/// - Other SQLite error codes propagated from the token callback
///
/// # Safety
///
/// This function is marked as `unsafe(no_mangle)` and `extern "C"` for FFI compatibility.
/// It wraps the internal tokenization logic with panic catching to prevent unwinding
/// across the FFI boundary, which would be undefined behavior.
///
/// The caller must ensure:
/// - `tokenizer` points to a valid [`Fts5Tokenizer`] instance
/// - `p_text` points to valid UTF-8 data of length `n_text`
/// - `x_token` is a valid function pointer
///
/// # C API Example
///
/// ```c
/// // Called by SQLite FTS5 when tokenizing text
/// int rc = lindera_fts5_tokenize(
///     tokenizer,
///     context,
///     0,
///     "日本語テキスト",
///     strlen("日本語テキスト"),
///     my_token_callback
/// );
/// ```
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    _flags: c_int,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> c_int {
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(
        || match lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token) {
            Ok(()) => SQLITE_OK,
            Err(code) => code,
        },
    ))
    .unwrap_or(SQLITE_INTERNAL)
}

/// Internal tokenization implementation.
///
/// Performs the actual tokenization of input text and invokes the callback function
/// for each token produced. This function handles UTF-8 validation, tokenization,
/// and error propagation.
///
/// # Parameters
///
/// - `tokenizer` - Pointer to the [`Fts5Tokenizer`] instance
/// - `p_ctx` - Context pointer to pass to the token callback
/// - `p_text` - Raw pointer to UTF-8 encoded text
/// - `n_text` - Length of text in bytes
/// - `x_token` - Callback function to invoke for each token
///
/// # Returns
///
/// - `Ok(())` - All tokens processed successfully
/// - `Err(SQLITE_OK)` - Invalid UTF-8 input (treated as success to keep database accessible)
/// - `Err(SQLITE_INTERNAL)` - Tokenization failed
/// - `Err(code)` - Error code returned by the token callback
///
/// # Safety
///
/// This function performs unsafe operations:
/// - Dereferences raw pointers (`tokenizer`, `p_text`)
/// - Creates slices from raw pointer and length
///
/// The caller must ensure all pointers are valid and properly aligned.
///
/// # Error Handling
///
/// - **UTF-8 Errors**: Mapped to [`SQLITE_OK`] to prevent database inaccessibility
/// - **Tokenization Errors**: Return [`SQLITE_INTERNAL`]
/// - **Callback Errors**: Propagated immediately, stopping tokenization
///
/// # Token Callback Protocol
///
/// For each token, the callback is invoked with:
/// - `p_ctx` - Context pointer (unchanged)
/// - `0` - Flags (currently always 0)
/// - Pointer to the token surface bytes (not NUL-terminated)
/// - Token length in bytes
/// - Byte offset of token start in original text
/// - Byte offset of token end in original text
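///
/// A sketch of a callback following this protocol, collecting tokens into a `Vec`
/// passed through `p_ctx` (the test module below takes the same approach; note
/// that the surface pointer is not NUL-terminated, so the length must be used):
///
/// ```ignore
/// extern "C" fn collect_token(
///     ctx: *mut c_void,      // here: a *mut Vec<(String, c_int, c_int)>
///     _flags: c_int,         // currently always 0
///     token: *const c_char,  // token surface bytes (not NUL-terminated)
///     token_len: c_int,      // surface length in bytes
///     start: c_int,          // byte offset of token start in the input
///     end: c_int,            // byte offset of token end in the input
/// ) -> c_int {
///     let tokens = unsafe { &mut *(ctx as *mut Vec<(String, c_int, c_int)>) };
///     let bytes =
///         unsafe { core::slice::from_raw_parts(token as *const u8, token_len as usize) };
///     tokens.push((String::from_utf8_lossy(bytes).into_owned(), start, end));
///     SQLITE_OK
/// }
/// ```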
#[inline]
fn lindera_fts5_tokenize_internal(
    tokenizer: *mut Fts5Tokenizer,
    p_ctx: *mut c_void,
    p_text: *const c_char,
    n_text: c_int,
    x_token: TokenFunction,
) -> Result<(), c_int> {
    if n_text <= 0 {
        return Ok(());
    }

    let slice = unsafe { core::slice::from_raw_parts(p_text as *const c_uchar, n_text as usize) };

    // Map errors to SQLITE_OK because failing here would make the database
    // inaccessible.
    let input = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;

    match unsafe { (*tokenizer).tokenizer.tokenize(input) } {
        Ok(tokens) => {
            for token in tokens {
                let rc = x_token(
                    p_ctx,
                    0,
                    token.surface.as_bytes().as_ptr() as *const c_char,
                    token.surface.len() as c_int,
                    token.byte_start as c_int,
                    token.byte_end as c_int,
                );
                if rc != SQLITE_OK {
                    return Err(rc);
                }
            }
        }
        Err(_) => {
            return Err(SQLITE_INTERNAL);
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    extern "C" fn token_callback(
        ctx: *mut c_void,
        flags: c_int,
        token: *const c_char,
        token_len: c_int,
        start: c_int,
        end: c_int,
    ) -> c_int {
        assert_eq!(flags, 0);

        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
        let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
        let slice =
            unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
        let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");

        tokens.push((token, start, end));

        SQLITE_OK
    }

    #[test]
    fn it_emits_segments() {
        let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        lindera_fts5_tokenize_internal(
            &mut tokenizer,
            &mut tokens as *mut _ as *mut c_void,
            input.as_bytes().as_ptr() as *const c_char,
            input.len() as i32,
            token_callback,
        )
        .expect("tokenize internal should not fail");

        assert_eq!(
            tokens,
            [
                ("Lindera", 0, 21),
                ("形態素", 24, 33),
                ("解析", 33, 39),
                ("エンジン", 39, 54),
                ("ユーザ", 63, 75),
                ("辞書", 75, 81),
                ("利用", 84, 90),
                ("可能", 90, 96)
            ]
            .map(|(s, start, end)| (s.to_owned(), start, end))
        );
    }

    #[test]
    fn it_ignores_invalid_utf8() {
        let input = b"\xc3\x28";
        let mut tokens: Vec<(String, c_int, c_int)> = vec![];

        let mut tokenizer = Fts5Tokenizer {
            tokenizer: load_tokenizer().unwrap(),
        };
        assert_eq!(
            lindera_fts5_tokenize_internal(
                &mut tokenizer,
                &mut tokens as *mut _ as *mut c_void,
                input.as_ptr() as *const c_char,
                input.len() as i32,
                token_callback,
            )
            .expect_err("tokenize internal should fail on invalid UTF-8"),
            SQLITE_OK
        );

        assert_eq!(tokens, []);
    }
}