CFDTCP/
lib.rs

1//! 这是一个简单判断多个短文本相关性的库。【需提供一个中心点,也就是需要选择一个短文本作为参照】
2//!
3//! 主要适用于比如:相关文章推荐之后的,各个推荐标题是否过于紧密的判断
4//!
5//! ## 流程:
6//! 1. 使用AC自动机,去除停用词。【当然更好的方式是 分词,但是为了提高运行效率,未使用分词】
7//! 2. 使用 one-hot 方式生成各个句子和中心句子的句向量,存储到Bitmap。【更好的是:使用语言模型生成向量】
8//! 3. 计算各个句子向量到中心句之间的距离。
9//! 4. 对计算的距离 统计离散系数,可以看出其 聚合程度
10//! 5. 或者使用Jenks Natural Breaks 聚类,并查找最优的【标准差最小】的簇,簇越多表示越离散,越少越聚合 【更好的是:使用密度聚类算法】
11//! ## Example
//! ```no_run
13//!     let centor =  "感冒第二天了,嗓子完全沙哑了,怎么办";
14//!     let list = [
15//!                "感冒咳嗽引起嗓子沙哑",
16//!                "我是感冒引起的嗓子沙哑",
17//!                "感冒咳嗽流鼻涕嗓子沙哑",
18//!                "因感冒引起的嗓子沙哑",
19//!                "感冒引起了嗓子沙哑。完全说不出话来",
20//!                "前几天感冒嗓子有点沙哑",
21//!                "年前感冒引起的嗓子沙哑",
22//!                "我是感冒引起的嗓子沙哑",
23//!                "感冒四天了,嗓子沙哑",
24//!     ];
25//!     let mut cfdtcp = CFDTCP::CFDTCP::new();
26//!     cfdtcp.centor(centor.to_owned()).list(list.to_vec());
//!     // 获取众数:出现次数最多的距离值
28//!     // @return (mode,count)
29//!     let mode = cfdtcp.mode().unwrap();
30//!     // 离散系数
31//!     // @return f64
32//!     let distribution = cfdtcp.distribution();
33//!     // 聚类 参数-表示最多计算的簇,比如有9个句子,最多只能分成9簇,越少计算越快,准确度越低
34//!     // @return (usize,Vec)
35//!     let class = cfdtcp.jenks_classify(9);
36//! ```
37use std::{collections::HashMap, fmt::Display, path::Path};
38
39use jenks::get_jenks_classification;
40use text_processing::Processing;
41use utilities::Classification;
42use vectorization::Vectorization;
43
44mod jenks;
45pub mod text_processing;
46mod utilities;
47pub mod vectorization;
48
/// Returns the mode of `v` (its most frequent value) together with the number
/// of times that value occurs.
///
/// Returns `None` when `v` is empty. When several values share the highest
/// count, the smallest of them is returned, so the result never depends on
/// `HashMap` iteration order.
pub fn mode(v: &[u64]) -> Option<(u64, i32)> {
    let mut frequencies: HashMap<u64, i32> = HashMap::new();
    for &value in v {
        *frequencies.entry(value).or_insert(0) += 1;
    }
    frequencies
        .into_iter()
        // Highest count wins; `Reverse` makes the smaller value win a tie.
        .max_by_key(|&(value, count)| (count, std::cmp::Reverse(value)))
}
/// Returns the arithmetic mean of `v`, or `None` when `v` is empty.
///
/// The total is accumulated in `u128`, which cannot overflow for any slice of
/// `u64` values, so large inputs neither panic (debug builds) nor wrap
/// (release builds).
pub fn mean(v: &[u64]) -> Option<f64> {
    if v.is_empty() {
        return None;
    }
    let total: u128 = v.iter().map(|&value| u128::from(value)).sum();
    Some(total as f64 / v.len() as f64)
}
74// 标准差
75pub fn std_deviation(v: &Vec<u64>) -> Option<f64> {
76    match (mean(v), v.len()) {
77        (Some(data_mean), count) if count > 0 => {
78            let variance = v
79                .iter()
80                .map(|value| {
81                    let diff = data_mean - (*value as f64);
82
83                    diff * diff
84                })
85                .sum::<f64>()
86                / count as f64;
87            Some(variance.sqrt())
88        }
89        _ => None,
90    }
91}
92// 离散系数
93fn discrete_coefficient(v: &Vec<u64>) -> Option<f64> {
94    match (std_deviation(v), mean(v)) {
95        (Some(dev), Some(m)) => Some(dev / m),
96        _ => None,
97    }
98}
99pub struct CFDTCP<'a> {
100    v: Vectorization<'a>,
101    p: Processing,
102}
103
104impl<'a> CFDTCP<'a> {
105    pub fn new() -> CFDTCP<'a> {
106        let mut P = text_processing::Processing::new_file(Path::new(".stop_word.txt"));
107        P.set_ac();
108        let mut V = Vectorization::new();
109        Self { v: V, p: P }
110    }
111    pub fn centor(&mut self, c: String) -> &mut Self {
112        self.v.centor(self.p.parse(c));
113        self
114    }
115
116    pub fn list(&mut self, list: Vec<&'a str>) -> &mut Self {
117        self.v.list(list);
118        self
119    }
120
121    pub fn distribution(&mut self) -> Option<f64> {
122        discrete_coefficient(&self.v.get_dt())
123    }
124
125    pub fn mode(&mut self) -> Option<(u64, i32)> {
126        mode(&self.v.get_dt())
127    }
128
129    pub fn jenks_classify(&mut self, num_bins: usize) -> (usize, Classification) {
130        let mut c = Classification::new();
131        let mut n: usize = 0;
132        let mut min = f64::MAX;
133        let data = self.v.get_dt();
134        for num_bin in 1..num_bins {
135            let class = get_jenks_classification(num_bin, data);
136            let max_dev = CFDTCP::one_jenks_max_std_deviation(data, &class);
137            // println!("max:{} min:{} n:{}", max_dev, min, num_bin);
138            if max_dev < min {
139                min = max_dev;
140                n = num_bin;
141                c = class;
142            }
143        }
144        (n + 1, c)
145    }
146    fn one_jenks_max_std_deviation(dt: &Vec<u64>, res: &Classification) -> f64 {
147        let mut max = 0.0;
148        let start = 0;
149        for bin in res {
150            match std_deviation(&dt[start as usize..start + bin.count as usize].to_vec()) {
151                Some(v) => {
152                    if v > max {
153                        max = v
154                    }
155                }
156                None => {}
157            }
158        }
159        max
160    }
161
162    pub fn get_dt(&mut self) -> &Vec<u64> {
163        self.v.get_dt()
164    }
165
166    pub fn clear(&mut self) {}
167}
168
169pub struct MyClassification(pub Classification);
170
171impl Display for MyClassification {
172    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173        let mut vec = Vec::new();
174        for bin in &self.0 {
175            vec.push(format!(
176                "start:{} end:{} count:{}",
177                bin.bin_start, bin.bin_end, bin.count
178            ));
179        }
180        write!(f, "{:?}", vec)
181    }
182}