tf_idf_vectorizer/lib.rs
1//! # TF-IDF Vectorizer
2//!
3//! This crate provides a **document analysis engine** based on a highly customizable
4//! **TF-IDF vectorizer**.
5//!
6//! It is designed for:
7//! - Full-text search engines
8//! - Document similarity analysis
9//! - Large-scale corpus processing
10//!
11//! ## Architecture Overview
12//!
13//! The crate is composed of the following core concepts:
14//!
15//! - **Corpus**: Global document-frequency statistics (IDF base)
16//! - **TermFrequency**: Per-document term statistics (TF base)
17//! - **TFIDFVectorizer**: Converts documents into sparse TF-IDF vectors
18//! - **TFIDFEngine**: Pluggable TF / IDF calculation strategy
19//! - **SimilarityAlgorithm**: Multiple scoring algorithms (Cosine, Dot, BM25-like)
20//!
21//! ## Example
22//!
23//! ```rust
24//! use std::sync::Arc;
25//!
26//! use half::f16;
27//! use tf_idf_vectorizer::{Corpus, SimilarityAlgorithm, TFIDFVectorizer, TermFrequency, vectorizer::evaluate::query::Query};
28//!
29//! fn main() {
30//! // build corpus
31//! let corpus = Arc::new(Corpus::new());
32//!
33//! // make term frequencies
34//! let mut freq1 = TermFrequency::new();
35//! freq1.add_terms(&["rust", "高速", "並列", "rust"]);
36//! let mut freq2 = TermFrequency::new();
37//! freq2.add_terms(&["rust", "柔軟", "安全", "rust"]);
38//!
39//! // add documents to vectorizer
40//! let mut vectorizer: TFIDFVectorizer<f16> = TFIDFVectorizer::new(corpus);
41//! vectorizer.add_doc("doc1".to_string(), &freq1);
42//! vectorizer.add_doc("doc2".to_string(), &freq2);
43//! vectorizer.del_doc(&"doc1".to_string());
44//! vectorizer.add_doc("doc3".to_string(), &freq1);
45//!
46//! let query = Query::and(Query::term("rust"), Query::term("安全"));
47//! let algorithm = SimilarityAlgorithm::CosineSimilarity;
48//! let mut result = vectorizer.search(&algorithm, query);
49//! result.sort_by_score_desc();
50//!
51//! // print result
52//! println!("Search Results: \n{}", result);
53//! // debug
54//! println!("result count: {}", result.list.len());
55//! println!("{:?}", vectorizer);
56//! }
57//! ```
58//!
59//! ## Thread Safety
60//!
61//! - `Corpus` is thread-safe and can be shared across vectorizers
62//! - Designed for parallel indexing and search workloads
63//!
64//! ## Serialization
65//!
66//! - `TFIDFVectorizer` and `TFIDFData` support serialization
67//! - `TFIDFData` does **not** hold a `Corpus` reference and is suitable for storage
68
// Public module tree. Every crate-root re-export in this file is taken from
// `vectorizer`; `utils` is public as well, but nothing from it is re-exported
// here (its contents are not visible from this file).
69pub mod vectorizer;
70pub mod utils;
71
72
// Main entry point: per the crate docs, converts documents into sparse TF-IDF
// vectors. The crate-level example constructs it from a shared `Corpus` and
// drives it through `add_doc`, `del_doc` and `search`.
73pub use vectorizer::TFIDFVectorizer;
74
75
// Serializable form described in the crate docs: it does not hold a `Corpus`
// reference, which makes it the type intended for storage.
76pub use vectorizer::serde::TFIDFData;
77
78
// Global document-frequency statistics (the IDF base). The crate docs state it
// is thread-safe and can be shared across vectorizers.
79pub use vectorizer::corpus::Corpus;
80
81
// Per-document term statistics (the TF base); the crate-level example fills
// one via `add_terms` before handing it to the vectorizer.
82pub use vectorizer::term::TermFrequency;
83
84
// Pluggable TF / IDF calculation strategy (`TFIDFEngine`).
// NOTE(review): `DefaultTFIDFEngine` is presumably the stock implementation of
// that strategy — confirm against `vectorizer::tfidf`.
85pub use vectorizer::tfidf::{DefaultTFIDFEngine, TFIDFEngine};
86
87
// Selector for the scoring algorithm used by a search; the crate docs list
// Cosine, Dot and BM25-like variants, and the example uses `CosineSimilarity`.
88pub use vectorizer::evaluate::scoring::SimilarityAlgorithm;
89
90
// Search query type; the crate-level example composes one with `Query::term`
// and `Query::and`, then passes it to `TFIDFVectorizer::search`.
91pub use vectorizer::evaluate::query::Query;
92
93
// Result types from the scoring module.
// NOTE(review): presumably `Hits` is what `search` returns and `HitEntry` is a
// single scored document — confirm against `vectorizer::evaluate::scoring`.
94pub use vectorizer::evaluate::scoring::{Hits, HitEntry};