1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
//! Text vectorization for machine learning.
//!
//! This module provides vectorization tools to convert text documents into numerical
//! feature vectors suitable for machine learning models:
//!
//! - **`CountVectorizer`**: Bag of Words representation (word counts)
//! - **`TfidfVectorizer`**: TF-IDF weighted features (term frequency-inverse document frequency)
//!
//! # Design Principles
//!
//! - Zero `unwrap()` calls (Cloudflare-class safety)
//! - Result-based error handling
//! - Comprehensive test coverage (≥95%)
//! - Integration with tokenizers and stop words
//!
//! # Quick Start
//!
//! ```
//! use aprender::text::vectorize::CountVectorizer;
//! use aprender::text::tokenize::WhitespaceTokenizer;
//!
//! let documents = vec![
//!     "the cat sat on the mat",
//!     "the dog sat on the log",
//! ];
//!
//! let mut vectorizer = CountVectorizer::new()
//!     .with_tokenizer(Box::new(WhitespaceTokenizer::new()));
//!
//! let matrix = vectorizer.fit_transform(&documents).expect("vectorization should succeed");
//! // matrix shape: (2 documents, vocabulary_size features)
//! ```
use crateMatrix;
use crateStopWordsFilter;
use crateTokenizer;
use crateAprenderError;
use DefaultHasher;
use HashMap;
use ;
/// Bag of Words vectorizer that converts text to a word-count matrix.
///
/// Transforms a collection of text documents into a matrix of token counts.
/// Each row represents a document, and each column represents a token in the vocabulary.
///
/// # Examples
///
/// ```
/// use aprender::text::vectorize::CountVectorizer;
/// use aprender::text::tokenize::WhitespaceTokenizer;
///
/// let docs = vec!["cat dog", "dog bird", "cat bird bird"];
///
/// let mut vectorizer = CountVectorizer::new()
///     .with_tokenizer(Box::new(WhitespaceTokenizer::new()));
///
/// let matrix = vectorizer.fit_transform(&docs).expect("fit_transform should succeed");
/// assert_eq!(matrix.n_rows(), 3); // 3 documents
/// assert_eq!(matrix.n_cols(), 3); // 3 unique words
/// ```
include!;
include!;
include!;