extern crate alloc;
mod common;
#[cfg(feature = "extension")]
mod extension;
use libc::{c_char, c_int, c_uchar, c_void};
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
pub use crate::common::*;
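/// Build a Lindera [`Tokenizer`] from the ambient configuration,
/// logging any failure and mapping it to `SQLITE_INTERNAL` so callers
/// can pass it straight back to SQLite as an error code.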
#[inline]
pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
let builder = TokenizerBuilder::new().map_err(|e| {
eprintln!("Failed to create tokenizer builder: {e}");
SQLITE_INTERNAL
})?;
let tokenizer = builder.build().map_err(|e| {
eprintln!("Failed to create tokenizer: {e}");
SQLITE_INTERNAL
})?;
Ok(tokenizer)
}
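/// FTS5 `xTokenize` entry point. All work happens behind
/// `ffi_panic_boundary` so a Rust panic can never unwind across the C
/// boundary; the boundary also flattens the inner `Result` into the
/// `c_int` return code SQLite expects.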
#[unsafe(no_mangle)]
pub extern "C" fn lindera_fts5_tokenize(
tokenizer: *mut Fts5Tokenizer,
p_ctx: *mut c_void,
_flags: c_int,
p_text: *const c_char,
n_text: c_int,
x_token: TokenFunction,
) -> c_int {
    crate::common::ffi_panic_boundary(|| {
        lindera_fts5_tokenize_internal(tokenizer, p_ctx, p_text, n_text, x_token)
    })
}
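/// Shared tokenization path used by both the FFI entry point above and
/// the unit tests below.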
#[inline]
fn lindera_fts5_tokenize_internal(
tokenizer: *mut Fts5Tokenizer,
p_ctx: *mut c_void,
p_text: *const c_char,
n_text: c_int,
x_token: TokenFunction,
) -> Result<(), c_int> {
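    // An empty (or negative, which FTS5 should never send) length means
    // there is nothing to tokenize; report success without touching the
    // pointers.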
if n_text <= 0 {
return Ok(());
}
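    // SAFETY: per the FTS5 xTokenize contract, `p_text` points at
    // `n_text` readable bytes and `tokenizer` is the state produced by
    // xCreate, both valid for the duration of this call.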
let input = unsafe { InputText::from_raw_parts(p_text, n_text)? };
let mut tokenizer = unsafe { TokenizerHandle::new(tokenizer)? };
let callback = crate::common::TokenCallback::new(p_ctx, x_token);
tokenizer.emit_tokens(input.as_str(), &callback)
}
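/// Non-null, exclusive view of the `Fts5Tokenizer` state SQLite hands
/// back through the FFI; a null pointer becomes `SQLITE_INTERNAL`
/// instead of being dereferenced.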
struct TokenizerHandle<'a> {
inner: &'a mut Fts5Tokenizer,
}
impl<'a> TokenizerHandle<'a> {
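    /// # Safety
    /// `ptr` must be null or point to a live `Fts5Tokenizer` that is
    /// not aliased for the lifetime of the returned handle.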
unsafe fn new(ptr: *mut Fts5Tokenizer) -> Result<Self, c_int> {
let inner = unsafe { ptr.as_mut() }.ok_or(SQLITE_INTERNAL)?;
Ok(Self { inner })
}
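    /// Run Lindera over `input` and forward each token's surface form
    /// and byte range to the FTS5 callback, stopping at the first
    /// error code the callback reports.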
fn emit_tokens(
&mut self,
input: &str,
callback: &crate::common::TokenCallback,
) -> Result<(), c_int> {
let tokens = self
.inner
.tokenizer
.tokenize(input)
.map_err(|_| SQLITE_INTERNAL)?;
for token in tokens {
callback.emit(token.surface.as_bytes(), token.byte_start, token.byte_end)?;
}
Ok(())
}
}
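/// Borrowed, UTF-8 validated view of the text buffer SQLite passed in.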
struct InputText<'a> {
text: &'a str,
}
impl<'a> InputText<'a> {
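    /// # Safety
    /// `ptr` must point to at least `len` readable bytes that remain
    /// valid for the lifetime `'a`.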
unsafe fn from_raw_parts(ptr: *const c_char, len: c_int) -> Result<Self, c_int> {
let slice = unsafe { core::slice::from_raw_parts(ptr as *const c_uchar, len as usize) };
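        // Lindera needs valid UTF-8. Rather than failing the query, map
        // undecodable input to an early `Err(SQLITE_OK)`: the caller
        // returns it to SQLite as "success, no tokens".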
let text = core::str::from_utf8(slice).map_err(|_| SQLITE_OK)?;
Ok(Self { text })
}
fn as_str(&self) -> &str {
self.text
}
}
#[cfg(test)]
mod tests {
use super::*;
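    // Test double for the FTS5 token callback: records each emitted
    // token and its byte range into the `Vec` smuggled through `ctx`.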
extern "C" fn token_callback(
ctx: *mut c_void,
flags: c_int,
token: *const c_char,
token_len: c_int,
start: c_int,
end: c_int,
) -> c_int {
assert_eq!(flags, 0);
        let tokens_ptr = ctx as *mut Vec<(String, c_int, c_int)>;
let tokens = unsafe { tokens_ptr.as_mut() }.expect("tokens pointer");
let slice =
unsafe { core::slice::from_raw_parts(token as *const c_uchar, token_len as usize) };
let token = String::from_utf8(slice.to_vec()).expect("Expected utf-8 token");
tokens.push((token, start, end));
        SQLITE_OK
}
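    // End-to-end run through the internal entry point. The expected
    // surfaces and byte offsets depend on the active Lindera
    // configuration (character and token filters), so they are not
    // plain byte offsets into the raw input string.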
#[test]
fn it_emits_segments() {
let input = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
let mut tokens: Vec<(String, c_int, c_int)> = vec![];
let mut tokenizer = Fts5Tokenizer {
tokenizer: load_tokenizer().unwrap(),
};
lindera_fts5_tokenize_internal(
&mut tokenizer,
&mut tokens as *mut _ as *mut c_void,
input.as_bytes().as_ptr() as *const c_char,
            input.len() as c_int,
token_callback,
)
.expect("tokenize internal should not fail");
assert_eq!(
tokens,
[
("Lindera", 0, 21),
("形態素", 24, 33),
("解析", 33, 39),
("エンジン", 39, 54),
("ユーザ", 63, 75),
("辞書", 75, 81),
("利用", 84, 90),
("可能", 90, 96)
]
.map(|(s, start, end)| (s.to_owned(), start, end))
);
}
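    // `\xc3\x28` is a malformed UTF-8 sequence: tokenization should
    // bail out early with `Err(SQLITE_OK)` and emit no tokens.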
#[test]
fn it_ignores_invalid_utf8() {
let input = b"\xc3\x28";
let mut tokens: Vec<(String, c_int, c_int)> = vec![];
let mut tokenizer = Fts5Tokenizer {
tokenizer: load_tokenizer().unwrap(),
};
assert_eq!(
lindera_fts5_tokenize_internal(
&mut tokenizer,
&mut tokens as *mut _ as *mut c_void,
input.as_ptr() as *const c_char,
                input.len() as c_int,
token_callback,
)
            .expect_err("tokenize internal should fail on invalid UTF-8"),
SQLITE_OK
);
assert_eq!(tokens, []);
}
}