tf_idf_vectorizer/
lib.rs

1//! # TF-IDF Vectorizer
2//!
3//! This crate provides a **document analysis engine** based on a highly customizable
4//! **TF-IDF vectorizer**.
5//!
6//! It is designed for:
7//! - Full-text search engines
8//! - Document similarity analysis
9//! - Large-scale corpus processing
10//!
11//! ## Architecture Overview
12//!
13//! The crate is composed of the following core concepts:
14//!
15//! - **Corpus**: Global document-frequency statistics (IDF base)
16//! - **TokenFrequency**: Per-document token statistics (TF base)
17//! - **TFIDFVectorizer**: Converts documents into sparse TF-IDF vectors
18//! - **TFIDFEngine**: Pluggable TF / IDF calculation strategy
19//! - **SimilarityAlgorithm**: Multiple scoring algorithms (Contains, Dot, Cosine, BM25-like)
20//!
21//! ## Example
22//!
23//! ```rust
24//! use std::sync::Arc;
25//! 
26//! use tf_idf_vectorizer::{Corpus, Query, SimilarityAlgorithm, TFIDFVectorizer, TokenFrequency};
27//! 
28//! fn main() {
29//!     // build corpus
30//!     let corpus = Arc::new(Corpus::new());
31//! 
32//!     // make token frequencies
33//!     let mut freq1 = TokenFrequency::new();
34//!     freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
35//!     let mut freq2 = TokenFrequency::new();
36//!     freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
37//! 
38//!     // add documents to vectorizer
39//!     let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
40//!     vectorizer.add_doc("doc1".to_string(), &freq1);
41//!     vectorizer.add_doc("doc2".to_string(), &freq2);
42//!     vectorizer.del_doc(&"doc1".to_string());
43//!     vectorizer.add_doc("doc3".to_string(), &freq1);
44//! 
45//!     let query = Query::and(Query::token("rust"), Query::token("安全"));
46//!     let algorithm = SimilarityAlgorithm::CosineSimilarity;
47//!     let mut result = vectorizer.search(&algorithm, query);
48//!     result.sort_by_score_desc();
49//! 
50//!     // print result
51//!     println!("Search Results: \n{}", result);
52//!     // debug
53//!     println!("result count: {}", result.list.len());
54//!     println!("{:?}", vectorizer);
55//! }
56//! ```
57//!
58//! ## Thread Safety
59//!
60//! - `Corpus` is thread-safe and can be shared across vectorizers
61//! - Designed for parallel indexing and search workloads
62//!
63//! ## Serialization
64//!
65//! - `TFIDFVectorizer` and `TFIDFData` support serialization
66//! - `TFIDFData` does **not** hold a `Corpus` reference and is suitable for storage
67
68pub mod vectorizer;
69pub mod utils;
70
71#[doc = "## Core Vectorizer"]
72/// TF-IDF Vectorizer
73///
74/// The top-level struct of this crate, providing the main TF-IDF vectorizer features.
75///
76/// It converts a document collection into TF-IDF vectors and supports similarity
77/// computation and search functionality.
78///
79/// ### Internals
80/// - Corpus vocabulary
81/// - Sparse TF vectors per document
82/// - Token index mapping
83/// - Cached IDF vector
84/// - Pluggable TF-IDF engine
85/// - Inverted document index
86///
87/// ### Type Parameters
88/// - `N`: Vector parameter type (e.g., `f32`, `f64`, `u16`)
89/// - `K`: Document key type (e.g., `String`, `usize`)
90/// - `E`: TF-IDF calculation engine
91///
92/// ### Notes
93/// - Requires an `Arc<Corpus>` on construction
94/// - `Corpus` can be shared across multiple vectorizers
95///
96/// ### Serialization
97/// Supported.  
98/// Serialized data includes the `Corpus` reference.
99///
100/// For corpus-independent storage, use [`TFIDFData`].
101pub use vectorizer::TFIDFVectorizer;
102
103#[doc = "## Serializable Data Structures"]
104/// TF-IDF Vectorizer Data Structure (Corpus-free)
105///
106/// A compact, serializable representation of a TF-IDF vectorizer.
107///
108/// Unlike [`TFIDFVectorizer`], this struct does **not** hold a `Corpus` reference.
109/// It can be converted back into a `TFIDFVectorizer` by providing an `Arc<Corpus>`.
110///
111/// ### Use Cases
112/// - Persistent storage
113/// - Network transfer
114/// - Memory-efficient snapshots
115///
116/// ### Serialization
117/// Supported.
118///
119/// ### Deserialization
120/// Supported, including internal data expansion.
121pub use vectorizer::serde::TFIDFData;
122
123#[doc = "## Corpus & Statistics"]
124/// Corpus for TF-IDF Vectorizer
125///
126/// Manages global document-frequency statistics required for IDF calculation.
127///
128/// This struct does **not** store document text or identifiers.
129/// It only tracks:
130/// - Total number of documents
131/// - Number of documents containing each token
132///
133/// ### Thread Safety
134/// - Fully thread-safe
135/// - Implemented using `DashMap` and atomics
136///
137/// ### Notes
138/// - Must be shared via `Arc<Corpus>`
139/// - Can be reused across multiple vectorizers
140pub use vectorizer::corpus::Corpus;
141
142/// Token Frequency Structure
143///
144/// Manages per-document token statistics used for TF calculation.
145///
146/// Tracks:
147/// - Token occurrence counts
148/// - Total token count in the document
149///
150/// ### Use Cases
151/// - TF calculation
152/// - Token-level statistics
153pub use vectorizer::token::TokenFrequency;
154
155#[doc = "## TF-IDF Engines"]
156/// TF-IDF Calculation Engine Trait
157///
158/// Defines the behavior of a TF-IDF calculation engine.
159///
160/// Custom engines can be implemented and plugged into
161/// [`TFIDFVectorizer`].
162///
163/// A default implementation, [`DefaultTFIDFEngine`], is provided.
164///
165/// ### Supported Numeric Types
166/// - `f16`
167/// - `f32`
168/// - `f64`
169/// - `u8`
170/// - `u16`
171/// - `u32`
172pub use vectorizer::tfidf::{DefaultTFIDFEngine, TFIDFEngine};
173
174#[doc = "## Similarity & Search"]
175/// Similarity Algorithm
176///
177/// Defines scoring algorithms used during search.
178///
179/// ### Variants
180/// - `Contains`: Token containment check
181/// - `Dot`: Dot product (suited to long documents)
182/// - `CosineSimilarity`: Cosine similarity (suited to proper nouns)
183/// - `BM25Like`: BM25-inspired scoring
184pub use vectorizer::evaluate::scoring::SimilarityAlgorithm;
185
186/// Query Structure
187///
188/// Represents a search query with logical filtering conditions.
189pub use vectorizer::evaluate::query::Query;
190
191/// Search Results
192///
193/// - `Hits`: A collection of ranked search results
194/// - `HitEntry`: A single search result entry
195pub use vectorizer::evaluate::scoring::{Hits, HitEntry};