Skip to main content

vldb_sqlite/
fts.rs

1use crate::tokenizer::{TokenizerMode, ensure_jieba_tokenizer_registered, tokenize_text};
2use rusqlite::{Connection, params};
3use serde::{Deserialize, Serialize};
4
5/// FTS 索引元信息返回(中英双语)。
6/// FTS index metadata response (bilingual).
7#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
8pub struct EnsureFtsIndexResult {
9    /// 操作是否成功。
10    /// Whether the operation succeeded.
11    pub success: bool,
12    /// 返回消息。
13    /// Human readable response message.
14    pub message: String,
15    /// 最终使用的索引名。
16    /// Effective sanitized index name.
17    pub index_name: String,
18    /// 最终使用的分词模式。
19    /// Effective tokenizer mode.
20    pub tokenizer_mode: String,
21}
22
23/// FTS 重建结果(中英双语)。
24/// FTS rebuild result payload (bilingual).
25#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
26pub struct RebuildFtsIndexResult {
27    /// 操作是否成功。
28    /// Whether the operation succeeded.
29    pub success: bool,
30    /// 返回消息。
31    /// Human readable response message.
32    pub message: String,
33    /// 索引名。
34    /// Effective index name.
35    pub index_name: String,
36    /// 使用的分词模式。
37    /// Effective tokenizer mode.
38    pub tokenizer_mode: String,
39    /// 重建时重新写回的文档数。
40    /// Number of documents reindexed during rebuild.
41    pub reindexed_rows: u64,
42}
43
44/// FTS 文档变更结果(中英双语)。
45/// FTS document mutation result (bilingual).
46#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
47pub struct FtsMutationResult {
48    /// 操作是否成功。
49    /// Whether the operation succeeded.
50    pub success: bool,
51    /// 返回消息。
52    /// Human readable response message.
53    pub message: String,
54    /// 受影响行数。
55    /// Number of affected rows.
56    pub affected_rows: u64,
57    /// 索引名。
58    /// Index name.
59    pub index_name: String,
60}
61
62/// FTS 命中文档(中英双语)。
63/// FTS hit payload (bilingual).
64#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
65pub struct SearchFtsHit {
66    /// 业务 ID。
67    /// Business identifier.
68    pub id: String,
69    /// 文件路径或逻辑路径。
70    /// File path or logical path.
71    pub file_path: String,
72    /// 标题。
73    /// Title field.
74    pub title: String,
75    /// 带命中高亮的标题。
76    /// Highlighted title text.
77    pub title_highlight: String,
78    /// 带上下文片段的正文摘要。
79    /// Content snippet with query highlights.
80    pub content_snippet: String,
81    /// 标准化分数,统一约定为“越大越好”。
82    /// Normalized score, always “higher is better”.
83    pub score: f64,
84    /// 当前结果中的排序名次。
85    /// Rank inside the current result set.
86    pub rank: u64,
87    /// SQLite `bm25()` 原始分值。
88    /// Raw SQLite `bm25()` score.
89    pub raw_score: f64,
90}
91
92/// FTS 检索结果(中英双语)。
93/// FTS search response payload (bilingual).
94#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
95pub struct SearchFtsResult {
96    /// 操作是否成功。
97    /// Whether the operation succeeded.
98    pub success: bool,
99    /// 返回消息。
100    /// Human readable response message.
101    pub message: String,
102    /// 索引名。
103    /// Index name.
104    pub index_name: String,
105    /// 使用的分词模式。
106    /// Effective tokenizer mode.
107    pub tokenizer_mode: String,
108    /// 规范化后的查询文本。
109    /// Normalized query text.
110    pub normalized_query: String,
111    /// 最终传给 SQLite MATCH 的表达式。
112    /// Final FTS MATCH expression passed into SQLite.
113    pub fts_query: String,
114    /// 检索结果来源标识,供混合检索层识别。
115    /// Result source label for hybrid retrieval layers.
116    pub source: String,
117    /// 查询模式标识,便于上层做统一调度。
118    /// Query mode label for upper-layer orchestration.
119    pub query_mode: String,
120    /// 命中总数。
121    /// Total number of hits.
122    pub total: u64,
123    /// 命中列表。
124    /// Search hit list.
125    pub hits: Vec<SearchFtsHit>,
126}
127
128/// 确保某个 FTS 索引存在(中英双语)。
129/// Ensure an FTS index exists for the requested logical name (bilingual).
130pub fn ensure_fts_index(
131    connection: &Connection,
132    index_name: &str,
133    tokenizer_mode: TokenizerMode,
134) -> rusqlite::Result<EnsureFtsIndexResult> {
135    if tokenizer_mode == TokenizerMode::Jieba {
136        ensure_jieba_tokenizer_registered(connection)?;
137    }
138
139    let index_name = sanitize_index_name(index_name)?;
140    let quoted_index_name = quote_identifier(&index_name);
141    let tokenizer_sql = tokenizer_sql(tokenizer_mode);
142
143    connection.execute_batch(&format!(
144        "CREATE VIRTUAL TABLE IF NOT EXISTS {index_name} USING fts5(
145            id UNINDEXED,
146            file_path UNINDEXED,
147            title,
148            content,
149            tokenize={tokenizer_sql}
150        );",
151        index_name = quoted_index_name,
152        tokenizer_sql = tokenizer_sql,
153    ))?;
154
155    Ok(EnsureFtsIndexResult {
156        success: true,
157        message: "fts index ensured / FTS 索引已确认存在".to_string(),
158        index_name,
159        tokenizer_mode: tokenizer_mode.as_str().to_string(),
160    })
161}
162
163/// 重建某个 FTS 索引,使已有文档重新吃到新的分词与词典策略(中英双语)。
164/// Rebuild an FTS index so existing rows pick up new tokenizer and dictionary behavior (bilingual).
165pub fn rebuild_fts_index(
166    connection: &Connection,
167    index_name: &str,
168    tokenizer_mode: TokenizerMode,
169) -> rusqlite::Result<RebuildFtsIndexResult> {
170    if tokenizer_mode == TokenizerMode::Jieba {
171        ensure_jieba_tokenizer_registered(connection)?;
172    }
173
174    let index_name = sanitize_index_name(index_name)?;
175    let quoted_index_name = quote_identifier(&index_name);
176    let exists: i64 = connection.query_row(
177        "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = ?1",
178        params![index_name.as_str()],
179        |row| row.get(0),
180    )?;
181
182    if exists == 0 {
183        let ensured = ensure_fts_index(connection, index_name.as_str(), tokenizer_mode)?;
184        return Ok(RebuildFtsIndexResult {
185            success: true,
186            message: "fts index created during rebuild / FTS 索引在重建过程中已创建".to_string(),
187            index_name: ensured.index_name,
188            tokenizer_mode: ensured.tokenizer_mode,
189            reindexed_rows: 0,
190        });
191    }
192
193    let mut statement = connection.prepare(&format!(
194        "SELECT id, file_path, title, content FROM {index_name} ORDER BY rowid ASC",
195        index_name = quoted_index_name
196    ))?;
197    let mut rows = statement.query([])?;
198    let mut documents = Vec::new();
199    while let Some(row) = rows.next()? {
200        documents.push((
201            row.get::<_, Option<String>>(0)?.unwrap_or_default(),
202            row.get::<_, Option<String>>(1)?.unwrap_or_default(),
203            row.get::<_, Option<String>>(2)?.unwrap_or_default(),
204            row.get::<_, Option<String>>(3)?.unwrap_or_default(),
205        ));
206    }
207    drop(rows);
208    drop(statement);
209
210    connection.execute_batch("BEGIN IMMEDIATE TRANSACTION;")?;
211    let rebuild_result = (|| -> rusqlite::Result<RebuildFtsIndexResult> {
212        connection.execute_batch(&format!(
213            "DROP TABLE IF EXISTS {index_name};",
214            index_name = quoted_index_name
215        ))?;
216        let ensured = ensure_fts_index(connection, index_name.as_str(), tokenizer_mode)?;
217        let mut reindexed_rows = 0_u64;
218        for (id, file_path, title, content) in documents {
219            upsert_fts_document(
220                connection,
221                ensured.index_name.as_str(),
222                tokenizer_mode,
223                id.as_str(),
224                file_path.as_str(),
225                title.as_str(),
226                content.as_str(),
227            )?;
228            reindexed_rows += 1;
229        }
230        Ok(RebuildFtsIndexResult {
231            success: true,
232            message: format!(
233                "fts index rebuilt (rows={}) / FTS 索引已重建",
234                reindexed_rows
235            ),
236            index_name: ensured.index_name,
237            tokenizer_mode: ensured.tokenizer_mode,
238            reindexed_rows,
239        })
240    })();
241
242    match rebuild_result {
243        Ok(result) => {
244            connection.execute_batch("COMMIT;")?;
245            Ok(result)
246        }
247        Err(error) => {
248            let _ = connection.execute_batch("ROLLBACK;");
249            Err(error)
250        }
251    }
252}
253
254/// 写入或更新 FTS 文档(中英双语)。
255/// Insert or update an FTS document (bilingual).
256pub fn upsert_fts_document(
257    connection: &Connection,
258    index_name: &str,
259    tokenizer_mode: TokenizerMode,
260    id: &str,
261    file_path: &str,
262    title: &str,
263    content: &str,
264) -> rusqlite::Result<FtsMutationResult> {
265    let ensured = ensure_fts_index(connection, index_name, tokenizer_mode)?;
266    let quoted_index_name = quote_identifier(&ensured.index_name);
267
268    let mut affected_rows = 0_u64;
269    affected_rows += connection.execute(
270        &format!("DELETE FROM {index_name} WHERE id = ?1", index_name = quoted_index_name),
271        params![id],
272    )? as u64;
273    affected_rows += connection.execute(
274        &format!(
275            "INSERT INTO {index_name} (id, file_path, title, content) VALUES (?1, ?2, ?3, ?4)",
276            index_name = quoted_index_name
277        ),
278        params![id, file_path, title, content],
279    )? as u64;
280
281    Ok(FtsMutationResult {
282        success: true,
283        message: "fts document upserted / FTS 文档已写入".to_string(),
284        affected_rows,
285        index_name: ensured.index_name,
286    })
287}
288
289/// 删除 FTS 文档(中英双语)。
290/// Delete an FTS document by business id (bilingual).
291pub fn delete_fts_document(
292    connection: &Connection,
293    index_name: &str,
294    id: &str,
295) -> rusqlite::Result<FtsMutationResult> {
296    let index_name = sanitize_index_name(index_name)?;
297    let quoted_index_name = quote_identifier(&index_name);
298    let affected_rows = connection.execute(
299        &format!("DELETE FROM {index_name} WHERE id = ?1", index_name = quoted_index_name),
300        params![id],
301    )? as u64;
302
303    Ok(FtsMutationResult {
304        success: true,
305        message: if affected_rows > 0 {
306            "fts document removed / FTS 文档已删除".to_string()
307        } else {
308            "fts document not found / FTS 文档不存在".to_string()
309        },
310        affected_rows,
311        index_name,
312    })
313}
314
315/// 执行标准化的 FTS 检索(中英双语)。
316/// Execute normalized FTS search with RRF-friendly fields (bilingual).
317pub fn search_fts(
318    connection: &Connection,
319    index_name: &str,
320    tokenizer_mode: TokenizerMode,
321    query: &str,
322    limit: u32,
323    offset: u32,
324) -> rusqlite::Result<SearchFtsResult> {
325    let ensured = ensure_fts_index(connection, index_name, tokenizer_mode)?;
326    let tokenized_query = tokenize_text(Some(connection), tokenizer_mode, query, true)?;
327    let quoted_index_name = quote_identifier(&ensured.index_name);
328    let effective_limit = limit.clamp(1, 200);
329
330    let total: u64 = connection.query_row(
331        &format!(
332            "SELECT COUNT(*) FROM {index_name} WHERE {index_name} MATCH ?1",
333            index_name = quoted_index_name,
334        ),
335        params![tokenized_query.fts_query.as_str()],
336        |row| row.get::<_, i64>(0),
337    )? as u64;
338
339    let mut statement = connection.prepare(&format!(
340        "SELECT
341            id,
342            file_path,
343            title,
344            highlight({index_name}, 2, '<mark>', '</mark>') AS title_highlight,
345            snippet({index_name}, 3, '<mark>', '</mark>', '...', 12) AS content_snippet,
346            bm25({index_name}, 2.0, 1.0) AS raw_score
347         FROM {index_name}
348         WHERE {index_name} MATCH ?1
349         ORDER BY raw_score ASC, file_path ASC, id ASC
350         LIMIT ?2 OFFSET ?3",
351        index_name = quoted_index_name,
352    ))?;
353
354    let mut rows = statement.query(params![
355        tokenized_query.fts_query.as_str(),
356        effective_limit as i64,
357        offset as i64
358    ])?;
359    let mut hits = Vec::new();
360    let mut rank = offset as u64 + 1;
361    while let Some(row) = rows.next()? {
362        let raw_score = row.get::<_, f64>(5)?;
363        hits.push(SearchFtsHit {
364            id: row.get(0)?,
365            file_path: row.get(1)?,
366            title: row.get::<_, Option<String>>(2)?.unwrap_or_default(),
367            title_highlight: row.get::<_, Option<String>>(3)?.unwrap_or_default(),
368            content_snippet: row.get::<_, Option<String>>(4)?.unwrap_or_default(),
369            score: -raw_score,
370            rank,
371            raw_score,
372        });
373        rank += 1;
374    }
375
376    Ok(SearchFtsResult {
377        success: true,
378        message: format!("fts search completed (hits={}) / FTS 检索完成", hits.len()),
379        index_name: ensured.index_name,
380        tokenizer_mode: ensured.tokenizer_mode,
381        normalized_query: tokenized_query.normalized_text,
382        fts_query: tokenized_query.fts_query,
383        source: "sqlite_fts".to_string(),
384        query_mode: "fts".to_string(),
385        total,
386        hits,
387    })
388}
389
390/// 校验并规范化索引名(中英双语)。
391/// Validate and normalize an FTS index name (bilingual).
392fn sanitize_index_name(index_name: &str) -> rusqlite::Result<String> {
393    let trimmed = index_name.trim();
394    if trimmed.is_empty() {
395        return Err(rusqlite::Error::InvalidParameterName(
396            "index_name must not be empty / index_name 不能为空".to_string(),
397        ));
398    }
399
400    let mut chars = trimmed.chars();
401    let Some(first) = chars.next() else {
402        return Err(rusqlite::Error::InvalidParameterName(
403            "index_name must not be empty / index_name 不能为空".to_string(),
404        ));
405    };
406    if !(first.is_ascii_alphabetic() || first == '_') {
407        return Err(rusqlite::Error::InvalidParameterName(
408            "index_name must start with [A-Za-z_] / index_name 必须以字母或下划线开头".to_string(),
409        ));
410    }
411    if !trimmed
412        .chars()
413        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
414    {
415        return Err(rusqlite::Error::InvalidParameterName(
416            "index_name only supports [A-Za-z0-9_] / index_name 仅支持字母数字下划线".to_string(),
417        ));
418    }
419    if trimmed.starts_with("_vulcan_") {
420        return Err(rusqlite::Error::InvalidParameterName(
421            "reserved index_name prefix / 保留索引名前缀".to_string(),
422        ));
423    }
424
425    Ok(trimmed.to_string())
426}
427
/// Wrap an identifier in double quotes for use in SQL text, doubling any embedded
/// double quote so it cannot terminate the quoted identifier.
fn quote_identifier(identifier: &str) -> String {
    // Two extra bytes for the surrounding quotes; embedded quotes grow it further.
    let mut quoted = String::with_capacity(identifier.len() + 2);
    quoted.push('"');
    for ch in identifier.chars() {
        if ch == '"' {
            quoted.push('"');
        }
        quoted.push(ch);
    }
    quoted.push('"');
    quoted
}
433
434/// 生成 FTS5 的 tokenizer SQL 片段(中英双语)。
435/// Build the FTS5 tokenizer SQL fragment (bilingual).
436fn tokenizer_sql(tokenizer_mode: TokenizerMode) -> &'static str {
437    match tokenizer_mode {
438        TokenizerMode::None => "'unicode61 remove_diacritics 2'",
439        TokenizerMode::Jieba => "'jieba'",
440    }
441}
442
#[cfg(test)]
mod tests {
    use super::*;

    /// Index name shared by the tests.
    const INDEX: &str = "memory_docs";
    /// Custom dictionary word that the tests register with the tokenizer.
    const CUSTOM_WORD: &str = "田-女士";

    /// Write the single sample document used by both tests.
    fn index_sample_document(connection: &Connection) -> rusqlite::Result<FtsMutationResult> {
        upsert_fts_document(
            connection,
            INDEX,
            TokenizerMode::Jieba,
            "doc-1",
            "/demo/file.md",
            "测试标题",
            "市民田-女士急匆匆",
        )
    }

    /// Create the `fts5vocab` helper table if it is missing and count the indexed
    /// instances of `term`.
    fn vocab_instances(connection: &Connection, term: &str) -> rusqlite::Result<i64> {
        connection.execute_batch(
            "CREATE VIRTUAL TABLE IF NOT EXISTS memory_docs_vocab USING fts5vocab(
                memory_docs,
                'instance'
            );",
        )?;
        connection.query_row(
            "SELECT count(*) FROM memory_docs_vocab WHERE term = ?1",
            params![term],
            |row| row.get(0),
        )
    }

    /// Minimal end-to-end flow: ensure the index, upsert a document, then search it.
    #[test]
    fn ensure_upsert_and_search_fts() -> rusqlite::Result<()> {
        let connection = Connection::open_in_memory()?;
        let ensured = ensure_fts_index(&connection, INDEX, TokenizerMode::Jieba)?;
        assert!(ensured.success);
        assert_eq!(ensured.index_name, "memory_docs");

        // Index once, register the custom word, then re-index the same document so
        // it is tokenized with the updated dictionary.
        index_sample_document(&connection)?;
        let _ = crate::tokenizer::upsert_custom_word(&connection, CUSTOM_WORD, 42)?;
        index_sample_document(&connection)?;

        let result = search_fts(
            &connection,
            INDEX,
            TokenizerMode::Jieba,
            CUSTOM_WORD,
            10,
            0,
        )?;
        assert!(result.success);
        assert_eq!(result.total, 1);
        assert_eq!(result.hits.len(), 1);

        let top_hit = &result.hits[0];
        assert_eq!(top_hit.id, "doc-1");
        assert_eq!(top_hit.file_path, "/demo/file.md");
        assert_eq!(top_hit.rank, 1);
        assert!(top_hit.content_snippet.contains("mark"));
        assert_eq!(result.source, "sqlite_fts");
        assert_eq!(result.query_mode, "fts");

        Ok(())
    }

    /// After a dictionary update, rebuilding the index must re-tokenize documents
    /// that were indexed before the update.
    #[test]
    fn rebuild_fts_index_reindexes_existing_documents() -> rusqlite::Result<()> {
        let connection = Connection::open_in_memory()?;
        ensure_fts_index(&connection, INDEX, TokenizerMode::Jieba)?;
        index_sample_document(&connection)?;

        // Before the custom word is registered it must not appear as a single term.
        assert_eq!(vocab_instances(&connection, CUSTOM_WORD)?, 0);

        crate::tokenizer::upsert_custom_word(&connection, CUSTOM_WORD, 42)?;
        let rebuild = rebuild_fts_index(&connection, INDEX, TokenizerMode::Jieba)?;
        assert!(rebuild.success);
        assert_eq!(rebuild.reindexed_rows, 1);

        // Recreate the vocab table so it is bound to the rebuilt index.
        connection.execute_batch("DROP TABLE IF EXISTS memory_docs_vocab;")?;
        assert_eq!(vocab_instances(&connection, CUSTOM_WORD)?, 1);

        Ok(())
    }
}
547}