tf_idf_vectorizer/vectorizer/
serde.rs1use std::sync::Arc;
2use std::hash::Hash;
3
4use ahash::RandomState;
5use num_traits::Num;
6use serde::{ser::SerializeStruct, Deserialize, Serialize};
7
8use crate::{Corpus, TFIDFVectorizer, utils::datastruct::{map::IndexMap, vector::ZeroSpVecTrait}, vectorizer::{IDFVector, KeyRc, TFVector, tfidf::{DefaultTFIDFEngine, TFIDFEngine}}};
9
10#[derive(Debug, Deserialize, Serialize)]
14pub struct TFIDFData<N = f32, K = String, E = DefaultTFIDFEngine>
15where
16 N: Num + Copy,
17 E: TFIDFEngine<N, K>,
18 K: Clone + Eq + Hash,
19{
20 pub documents: IndexMap<KeyRc<K>, TFVector<N>>,
22 pub token_dim_sample: Vec<Box<str>>,
24 #[serde(default, skip_serializing, skip_deserializing)]
26 pub idf: Option<IDFVector<N>>,
27 #[serde(default, skip_serializing, skip_deserializing)]
28 _marker: std::marker::PhantomData<E>,
29}
30
31impl<N, K, E> TFIDFData<N, K, E>
32where
33 N: Num + Copy + Into<f64> + Send + Sync,
34 E: TFIDFEngine<N, K>,
35 K: Clone + Send + Sync + Eq + Hash,
36{
37 pub fn into_tf_idf_vectorizer(self, corpus_ref: Arc<Corpus>) -> TFIDFVectorizer<N, K, E>
40 {
41 let raw_iter = self.documents.iter();
42 let mut token_dim_rev_index: IndexMap<Box<str>, Vec<KeyRc<K>>, RandomState> =
43 IndexMap::with_capacity(self.token_dim_sample.len());
44 self.token_dim_sample.iter().for_each(|token| {
46 token_dim_rev_index.insert(token.clone(), Vec::new());
47 });
48 for (key, doc) in raw_iter {
49 doc.tf_vec.raw_iter().for_each(|(idx, _)| {
50 let token = &self.token_dim_sample[idx];
51 token_dim_rev_index
52 .get_mut(token).unwrap()
53 .push(key.clone());
54 });
55 }
56
57 let mut instance = TFIDFVectorizer {
58 documents: self.documents,
59 token_dim_rev_index: token_dim_rev_index,
60 corpus_ref,
61 idf_cache: IDFVector::new(),
62 _marker: std::marker::PhantomData,
63 };
64 instance.update_idf();
65 instance
66 }
67}
68
69impl<N, K, E> Serialize for TFIDFVectorizer<N, K, E>
70where
71 N: Num + Copy + Serialize + Into<f64> + Send + Sync,
72 K: Serialize + Clone + Send + Sync + Eq + Hash,
73 E: TFIDFEngine<N, K>,
74{
75 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
79 where
80 S: serde::Serializer,
81 {
82 let mut state = serializer.serialize_struct("TFIDFVectorizer", 2)?;
83 state.serialize_field("documents", &self.documents)?;
84 state.serialize_field("token_dim_sample", &self.token_dim_rev_index.keys())?;
85 state.end()
86 }
87}