tf_idf_vectorizer/
lib.rs

1//! # TF-IDF Vectorizer
2//!
3//! This crate provides a **document analysis engine** based on a highly customizable
4//! **TF-IDF vectorizer**.
5//!
6//! It is designed for:
7//! - Full-text search engines
8//! - Document similarity analysis
9//! - Large-scale corpus processing
10//!
11//! ## Architecture Overview
12//!
13//! The crate is composed of the following core concepts:
14//!
15//! - **Corpus**: Global document-frequency statistics (IDF base)
16//! - **TermFrequency**: Per-document term statistics (TF base)
17//! - **TFIDFVectorizer**: Converts documents into sparse TF-IDF vectors
18//! - **TFIDFEngine**: Pluggable TF / IDF calculation strategy
19//! - **SimilarityAlgorithm**: Multiple scoring algorithms (Cosine, Dot, BM25-like)
20//!
21//! ## Example
22//!
23//! ```rust
24//! use std::sync::Arc;
25//! 
26//! use half::f16;
27//! use tf_idf_vectorizer::{Corpus, Query, SimilarityAlgorithm, TFIDFVectorizer, TermFrequency};
28//! 
29//! fn main() {
30//!     // build corpus
31//!     let corpus = Arc::new(Corpus::new());
32//! 
33//!     // make term frequencies
34//!     let mut freq1 = TermFrequency::new();
35//!     freq1.add_terms(&["rust", "高速", "並列", "rust"]);
36//!     let mut freq2 = TermFrequency::new();
37//!     freq2.add_terms(&["rust", "柔軟", "安全", "rust"]);
38//! 
39//!     // add documents to vectorizer
40//!     let mut vectorizer: TFIDFVectorizer<f16> = TFIDFVectorizer::new(corpus);    
41//!     vectorizer.add_doc("doc1".to_string(), &freq1);
42//!     vectorizer.add_doc("doc2".to_string(), &freq2);
43//!     vectorizer.del_doc(&"doc1".to_string());
44//!     vectorizer.add_doc("doc3".to_string(), &freq1);
45//! 
46//!     let query = Query::and(Query::term("rust"), Query::term("安全"));
47//!     let algorithm = SimilarityAlgorithm::CosineSimilarity;
48//!     let mut result = vectorizer.search(&algorithm, query);
49//!     result.sort_by_score_desc();
50//! 
51//!     // print result
52//!     println!("Search Results: \n{}", result);
53//!     // debug
54//!     println!("result count: {}", result.list.len());
55//!     println!("{:?}", vectorizer);
56//! }
57//! ```
58//!
59//! ## Thread Safety
60//!
61//! - `Corpus` is thread-safe and can be shared across vectorizers, e.g. by cloning the `Arc<Corpus>` that is passed to `TFIDFVectorizer::new`
62//! - Designed for parallel indexing and search workloads
63//!
64//! ## Serialization
65//!
66//! - `TFIDFVectorizer` and `TFIDFData` support serialization
67//! - `TFIDFData` does **not** hold a `Corpus` reference and is suitable for storage
68
// Public modules of the crate.
//
// `vectorizer` is the parent of every submodule re-exported below
// (`serde`, `corpus`, `term`, `tfidf`, `evaluate::scoring`, `evaluate::query`).
69pub mod vectorizer;
// NOTE(review): contents of `utils` are not visible from this file —
// presumably shared helpers; confirm against the module itself.
70pub mod utils;
71
72
// Crate-root re-exports: they let callers write `tf_idf_vectorizer::Corpus`
// etc. instead of spelling out the full module path.

// Vectorizer that converts documents into sparse TF-IDF vectors.
73pub use vectorizer::TFIDFVectorizer;
74
75
// Serializable TF-IDF data; per the crate docs it does not hold a `Corpus`
// reference and is suitable for storage.
76pub use vectorizer::serde::TFIDFData;
77
78
// Global document-frequency statistics (the IDF base).
79pub use vectorizer::corpus::Corpus;
80
81
// Per-document term statistics (the TF base).
82pub use vectorizer::term::TermFrequency;
83
84
// Pluggable TF / IDF calculation strategy.
// NOTE(review): `DefaultTFIDFEngine` is presumably the stock implementation
// of `TFIDFEngine` — confirm in `vectorizer::tfidf`.
85pub use vectorizer::tfidf::{DefaultTFIDFEngine, TFIDFEngine};
86
87
// Selector for the scoring algorithm handed to `TFIDFVectorizer::search`
// (the crate example uses `SimilarityAlgorithm::CosineSimilarity`).
88pub use vectorizer::evaluate::scoring::SimilarityAlgorithm;
89
90
// Search query type; the crate example builds one with `Query::and` and
// `Query::term`.
91pub use vectorizer::evaluate::query::Query;
92
93
// Types from the scoring module.
// NOTE(review): presumably the result set returned by `search` and its
// individual entries — confirm in `vectorizer::evaluate::scoring`.
94pub use vectorizer::evaluate::scoring::{Hits, HitEntry};