// tcvectordb 0.1.9
//
// Rust SDK for Tencent Cloud VectorDB
// Documentation
use tcvectordb::{
    document::{AnnSearch, KeywordSearch, Rerank},
    enums::{FieldType, IndexType, MetricType, ReadConsistency},
    index::HNSWParams,
    Document, Embedding, FilterIndex, Index, Result, SparseIndex, VectorDBClient, VectorIndex,
};

/// End-to-end hybrid search demo mirroring the Python SDK's `hybrid_search` example.
///
/// Steps, in order: build a client from environment variables, create the
/// `db-test` database and the `book-emb` collection (primary key, dense vector,
/// sparse vector and text filter indexes plus a text embedding configuration),
/// upsert three sample documents, run one hybrid search combining an ANN text
/// query with a keyword (sparse vector) query under a weighted rerank, print the
/// results, and finally drop the collection and the database.
///
/// # Environment
///
/// * `VECTORDB_URL` - service endpoint; defaults to `http://localhost:8100`.
/// * `VECTORDB_USERNAME` - account name; defaults to `root`.
/// * `VECTORDB_API_KEY` - required; there is no usable default for a secret.
///
/// # Errors
///
/// Client construction, database/collection creation and index definition
/// failures are propagated with `?`. Upsert, search and cleanup failures are
/// only printed so the remaining steps of the demo still run.
///
/// # Panics
///
/// Panics if `VECTORDB_API_KEY` is not set.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 Python Hybrid Search Complete Example - Python 混合搜索完整示例");

    // Connection parameters.
    let url = std::env::var("VECTORDB_URL").unwrap_or_else(|_| "http://localhost:8100".to_string());
    let username = std::env::var("VECTORDB_USERNAME").unwrap_or_else(|_| "root".to_string());
    let api_key = std::env::var("VECTORDB_API_KEY")
        .expect("VECTORDB_API_KEY environment variable is required");

    // Create the client.
    // NOTE(review): the trailing `30` is presumably a timeout in seconds — confirm
    // against the `VectorDBClient::new` signature.
    let client = VectorDBClient::new(
        &url,
        &username,
        &api_key,
        ReadConsistency::EventualConsistency,
        30,
    )?;

    println!("✅ VectorDB client created successfully");

    // Create the database (reused if it already exists).
    let db_name = "db-test";
    let db = client.create_database_if_not_exists(db_name).await?;
    println!("✅ Database '{}' ready", db_name);

    // Index configuration that supports hybrid search.
    let mut index = Index::new();

    // Primary key index.
    index.add_filter_index(FilterIndex::new(
        "id",
        FieldType::String,
        IndexType::PRIMARY_KEY,
    ))?;

    // Dense vector index (used for semantic search).
    // NOTE(review): the HNSW arguments 16 / 200 are presumably M and
    // efConstruction — confirm against `HNSWParams::new`.
    index.add_vector_index(VectorIndex::new(
        "vector",
        768, // dimension of the bge-base-zh model
        IndexType::HNSW,
        MetricType::COSINE,
        Some(tcvectordb::index::IndexParams::HNSW(HNSWParams::new(
            16, 200,
        ))),
    ))?;

    // Sparse vector index (used for keyword / BM25 search).
    index.add_sparse_index(SparseIndex::new(
        "sparse_vector",
        IndexType::SPARSE_INVERTED,
        MetricType::IP,
    ))?;

    // Filter index on the raw text field.
    index.add_filter_index(FilterIndex::new(
        "text",
        FieldType::String,
        IndexType::FILTER,
    ))?;

    // Embedding configuration built from the "vector" and "text" field names.
    let embedding = Embedding::new("vector", "text").with_model_name("bge-base-zh");

    println!("🔧 Creating collection with hybrid search support...");

    // Create the collection (reused if it already exists).
    let collection_name = "book-emb";
    let collection = db
        .create_collection_if_not_exists(
            collection_name,
            1, // shard
            0, // replicas
            Some("Collection for Python hybrid search equivalent demo".to_string()),
            Some(index),
            Some(embedding),
            None, // ttl_config
        )
        .await?;

    println!(
        "✅ Collection '{}' ready for hybrid search",
        collection.name()
    );

    // Insert test documents (each carries a sparse vector).
    println!("\n📝 Inserting test documents with sparse vectors...");

    // Mock sparse vectors; in a real application a BM25 encoder produces these.
    // NOTE(review): these are placeholder numbers. The service presumably expects
    // `[term_id, weight]` pairs, so the upsert/search may be rejected — confirm
    // against the sparse vector documentation before copying this data shape.
    let sparse_vector_1 = vec![vec![0.8, 0.9, 0.7, 0.5]]; // Tencent Cloud VectorDB topic
    let sparse_vector_2 = vec![vec![0.9, 0.8, 0.7, 0.4]]; // machine learning topic
    let sparse_vector_3 = vec![vec![0.8, 0.7, 0.6, 0.3]]; // cloud computing topic

    let documents = vec![
        Document::new()
            .with_id("doc1")
            .with_field("text", "腾讯云向量数据库是一种高性能的向量存储和检索服务")
            .with_field("sparse_vector", sparse_vector_1),
        Document::new()
            .with_id("doc2")
            .with_field("text", "机器学习算法在人工智能领域发挥重要作用")
            .with_field("sparse_vector", sparse_vector_2),
        Document::new()
            .with_id("doc3")
            .with_field("text", "云计算服务器提供可靠的数据存储解决方案")
            .with_field("sparse_vector", sparse_vector_3),
    ];

    // Capture the batch size before `documents` is moved into `upsert`, so the
    // success message always reports the real count instead of a magic number.
    let doc_count = documents.len();

    // NOTE(review): the meaning of the `None` / `true` arguments is not visible
    // here — check the `upsert` signature.
    match collection.upsert(documents, None, true).await {
        Ok(_) => println!("{} documents inserted successfully", doc_count),
        Err(e) => {
            // Best effort: keep going so the search API usage is still shown.
            println!("❌ Failed to insert documents: {}", e);
            println!("💡 Continuing with search demo anyway...");
        }
    }

    // Give the service a moment to build the index before querying.
    println!("\n⏳ Waiting for index to build...");
    tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;

    // Mock BM25 encoder output.
    // Python: bm25 = BM25Encoder.default('zh')
    // Python: data=bm25.encode_queries('向量数据库')
    let bm25_encoded_query = vec![vec![
        vec![0.8, 0.6, 0.4, 0.3], // stand-in for the BM25 encoding of '向量数据库'
    ]];

    println!("\n🔍 Executing hybrid search...");
    println!("📝 Query: '什么是腾讯云向量数据库'");
    println!("🔑 BM25 Keywords: '向量数据库'");

    // Python equivalent:
    // ann=[AnnSearch(field_name="text", data='什么是腾讯云向量数据库', limit=2)]
    let ann_search = vec![AnnSearch::new()
        .with_field_name("text")
        .with_text("什么是腾讯云向量数据库")
        .with_limit(2)];

    // Python equivalent:
    // match=[KeywordSearch(field_name="sparse_vector", terminate_after=4000,
    //                     cutoff_frequency=0.1, data=bm25.encode_queries('向量数据库'), limit=2)]
    let keyword_search = KeywordSearch::new()
        .with_field_name("sparse_vector")
        .with_terminate_after(4000)
        .with_cutoff_frequency(0.1)
        .with_data(bm25_encoded_query)
        .with_limit(2);

    // Python equivalent:
    // rerank=WeightedRerank(field_list=['vector', 'sparse_vector'], weight=[0.9, 0.1])
    let rerank = Rerank::weighted(
        vec!["vector".to_string(), "sparse_vector".to_string()],
        vec![0.9, 0.1], // 90% semantic search weight, 10% keyword search weight
    );

    // Python equivalent:
    // doc_lists = client.hybrid_search(
    //     database_name='db-test',
    //     collection_name='book-emb',
    //     ann=ann,
    //     match=match,
    //     rerank=rerank,
    //     retrieve_vector=False,
    //     limit=3,
    // )
    match collection
        .hybrid_search(
            ann_search,
            Some(keyword_search),
            Some(rerank),
            3,    // limit
            None, // output_fields
            None, // timeout
        )
        .await
    {
        Ok(documents) => {
            println!("✅ Hybrid search successful!");

            // Python equivalent:
            // for i, docs in enumerate(doc_lists):
            //     print(i)
            //     for doc in docs:
            //         print(doc)
            println!("\n📋 Found {} documents", documents.len());
            for (i, doc) in documents.iter().enumerate() {
                println!("\n   📄 Document {}:", i);
                println!("      ID: {:?}", doc.get_id());

                // Print the fields of interest when present.
                if let Some(text) = doc.get("text") {
                    println!("      Text: {:?}", text);
                }
                if let Some(score) = doc.get("_score") {
                    println!("      Score: {:?}", score);
                }

                // Print the whole document (similar to Python's print(doc)).
                println!("      Full Document: {:?}", doc);
            }

            println!("\n🎯 Search completed successfully!");
        }
        Err(e) => {
            // A failed search is reported but does not abort the demo, so the
            // cleanup below still runs.
            println!("❌ Hybrid search failed: {}", e);
            println!("\n💡 This demonstrates the API structure even if the search fails.");
            println!("   In a real scenario with proper BM25 encoding, this would work.");
        }
    }

    // Cleanup.
    // NOTE(review): both resources were obtained with `*_if_not_exists`, so they
    // may have existed before this run; dropping them here removes them either
    // way. Point the demo at a disposable instance.
    println!("\n🧹 Cleaning up...");
    match db.drop_collection(collection_name).await {
        Ok(_) => println!("✅ Collection '{}' dropped", collection_name),
        Err(e) => println!("⚠️  Failed to drop collection: {}", e),
    }

    match client.drop_database(db_name).await {
        Ok(_) => println!("✅ Database '{}' dropped", db_name),
        Err(e) => println!("⚠️  Failed to drop database: {}", e),
    }

    println!("\n🎉 Python hybrid search equivalent demo completed!");

    print_mapping_summary();

    Ok(())
}

/// Prints the static Python-to-Rust API mapping table shown at the end of the demo.
fn print_mapping_summary() {
    println!("\n📚 Python to Rust Mapping Summary:");
    println!("+-------------------------------------------------------------+");
    println!("| Python Code                   | Rust Equivalent             |");
    println!("+-------------------------------------------------------------+");
    println!("| from tcvectordb.model.document| use tcvectordb::document::  |");
    println!("| import AnnSearch, KeywordSearch| {{AnnSearch, KeywordSearch}} |");
    println!("|                               |                             |");
    println!("| bm25 = BM25Encoder.default()  | // Use external BM25 lib    |");
    println!("| data = bm25.encode_queries()  | let data = vec![vec![...]]; |");
    println!("|                               |                             |");
    println!("| AnnSearch(field_name=\"text\",  | AnnSearch::new()            |");
    println!("|           data='query')       |   .with_field_name(\"text\")  |");
    println!("|                               |   .with_text(\"query\")       |");
    println!("|                               |                             |");
    println!("| KeywordSearch(field_name=..., | KeywordSearch::new()        |");
    println!("|               data=bm25_data) |   .with_field_name(...)     |");
    println!("|                               |   .with_data(bm25_data)     |");
    println!("|                               |                             |");
    println!("| WeightedRerank(field_list=[], | Rerank::weighted(           |");
    println!("|                weight=[])     |   vec![...], vec![...])     |");
    println!("|                               |                             |");
    println!("| client.hybrid_search(         | collection.hybrid_search(   |");
    println!("|   database_name='db-test',    |   ann_search,               |");
    println!("|   collection_name='book-emb', |   Some(keyword_search),     |");
    println!("|   ann=ann, match=match,       |   Some(rerank),             |");
    println!("|   rerank=rerank, limit=3)     |   3, None, None).await      |");
    println!("|                               |                             |");
    println!("| for i, docs in enumerate():   | for (i, doc) in             |");
    println!("|   for doc in docs:            |   documents.iter().enum():  |");
    println!("|     print(doc)                |     println!(\"{{:?}}\", doc)    |");
    println!("+-------------------------------------------------------------+");
}