1#![allow(unused_variables)]
2
3use crate::cd::{encoding_languages, mb_encoding_languages};
4use crate::consts::{IANA_SUPPORTED_ALIASES, TOO_BIG_SEQUENCE};
5use crate::utils::{decode, iana_name, is_multi_byte_encoding, range_scan};
6use encoding::DecoderTrap;
7use ordered_float::OrderedFloat;
8use std::borrow::Cow;
9use std::cmp::Ordering;
10use std::fmt;
11use std::fmt::{Debug, Display, Formatter};
12use std::hash::Hash;
13use std::ops::Index;
14
/// Natural languages that a detection result can be attributed to.
///
/// The variant name doubles as the human-readable label: both the derived
/// `Debug` output and the `Display` implementation in this file print it
/// verbatim. `Unknown` is the fallback used when no language can be named.
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    German,
    French,
    Dutch,
    Italian,
    Polish,
    Spanish,
    Russian,
    Japanese,
    Portuguese,
    Swedish,
    Chinese,
    Ukrainian,
    Norwegian,
    Finnish,
    Vietnamese,
    Czech,
    Hungarian,
    Korean,
    Indonesian,
    Turkish,
    Romanian,
    Farsi,
    Arabic,
    Danish,
    Serbian,
    Lithuanian,
    Slovene,
    Slovak,
    Hebrew,
    Bulgarian,
    Croatian,
    Hindi,
    Estonian,
    Thai,
    Greek,
    Tamil,
    Kazakh,
    /// Fallback for content whose language could not be determined.
    Unknown,
}
61
62impl Display for Language {
63 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
64 write!(f, "{:?}", self)
65 }
66}
67
68#[derive(Debug, PartialEq, Clone)]
73pub(crate) struct CoherenceMatch {
74 pub language: &'static Language,
75 pub score: OrderedFloat<f32>,
76}
77
78pub(crate) type CoherenceMatches = Vec<CoherenceMatch>;
79
80#[derive(Clone)]
85pub struct CharsetMatch {
86 payload: Cow<'static, [u8]>,
87 encoding: String,
88
89 mean_mess_ratio: OrderedFloat<f32>,
90 coherence_matches: CoherenceMatches,
91
92 has_sig_or_bom: bool,
93
94 submatch: Vec<CharsetMatch>,
95 decoded_payload: Option<String>,
96}
97
98impl Display for CharsetMatch {
99 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
100 write!(f, "{:?} ({})", self.payload, self.encoding)
101 }
102}
103
104impl Debug for CharsetMatch {
105 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
106 write!(f, "{:?} ({})", self.payload, self.encoding)
107 }
108}
109
110impl Default for CharsetMatch {
111 fn default() -> Self {
112 CharsetMatch {
113 payload: Cow::Borrowed(&[]),
114 encoding: "utf-8".to_string(),
115 mean_mess_ratio: OrderedFloat(0.0),
116 coherence_matches: vec![],
117 has_sig_or_bom: false,
118 submatch: vec![],
119 decoded_payload: None,
120 }
121 }
122}
123
124impl PartialEq<Self> for CharsetMatch {
125 fn eq(&self, other: &Self) -> bool {
126 self.encoding == other.encoding && self.decoded_payload == other.decoded_payload
127 }
128}
129
130impl Eq for CharsetMatch {}
131
132impl Ord for CharsetMatch {
133 fn cmp(&self, other: &Self) -> Ordering {
134 let mess_difference = (self.mean_mess_ratio - other.mean_mess_ratio).abs();
135 let coherence_a = OrderedFloat(self.coherence());
136 let coherence_b = OrderedFloat(other.coherence());
137 let coherence_difference = (coherence_a - coherence_b).abs();
138
139 if mess_difference < 0.01 {
141 if coherence_difference > 0.02 {
142 return coherence_b.cmp(&coherence_a);
143 }
144 let multibyte_usage_a = OrderedFloat(self.multi_byte_usage());
145 let multibyte_usage_b = OrderedFloat(other.multi_byte_usage());
146 let multibyte_usage_delta = (multibyte_usage_a - multibyte_usage_b).abs();
147 if multibyte_usage_delta > f32::EPSILON {
148 return multibyte_usage_b.cmp(&multibyte_usage_a);
149 }
150 }
151 self.mean_mess_ratio.cmp(&other.mean_mess_ratio)
152 }
153}
154
155impl PartialOrd<Self> for CharsetMatch {
156 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
157 Some(self.cmp(other))
158 }
159}
160
161impl CharsetMatch {
162 pub(crate) fn new(
164 payload: Cow<'static, [u8]>,
165 encoding: &str,
166 mean_mess_ratio: f32,
167 has_sig_or_bom: bool,
168 coherence_matches: &CoherenceMatches,
169 decoded_payload: Option<&str>,
170 ) -> Self {
171 CharsetMatch {
172 payload: payload.clone(),
173 encoding: String::from(encoding),
174 mean_mess_ratio: OrderedFloat(mean_mess_ratio),
175 coherence_matches: coherence_matches.clone(),
176 has_sig_or_bom,
177 submatch: vec![],
178 decoded_payload: decoded_payload.map(String::from).or_else(|| {
179 decode(&payload, encoding, DecoderTrap::Strict, false, true)
180 .ok()
181 .map(|res| res.strip_prefix('\u{feff}').unwrap_or(&res).to_string())
182 }),
183 }
184 }
185
186 pub(crate) fn add_submatch(&mut self, submatch: &CharsetMatch) {
188 self.submatch.push(submatch.clone());
189 }
191
192 pub fn encoding_aliases(&self) -> Vec<&'static str> {
194 IANA_SUPPORTED_ALIASES
195 .get(self.encoding.as_str())
196 .cloned()
197 .expect("Problem with static HashMap IANA_SUPPORTED_ALIASES")
198 }
199 pub fn bom(&self) -> bool {
201 self.has_sig_or_bom
202 }
203 pub fn encoding(&self) -> &str {
204 &self.encoding
205 }
206 pub fn chaos(&self) -> f32 {
207 self.mean_mess_ratio.0
208 }
209 pub fn most_probably_language(&self) -> &'static Language {
212 self.coherence_matches.first().map_or_else(
213 || {
215 if self.suitable_encodings().contains(&String::from("ascii")) {
216 &Language::English
217 } else {
218 let languages = if is_multi_byte_encoding(&self.encoding) {
219 mb_encoding_languages(&self.encoding)
220 } else {
221 encoding_languages(self.encoding.clone())
222 };
223 languages.first().copied().unwrap_or(&Language::Unknown)
224 }
225 },
226 |lang| lang.language,
227 )
228 }
229 pub fn languages(&self) -> Vec<&'static Language> {
232 self.coherence_matches
233 .iter()
234 .map(|cm| cm.language)
235 .collect()
236 }
237 pub fn has_submatch(&self) -> bool {
239 !self.submatch.is_empty()
240 }
241 pub fn submatch(&self) -> &Vec<CharsetMatch> {
243 &self.submatch
244 }
245 pub fn multi_byte_usage(&self) -> f32 {
247 let decoded_chars = self.decoded_payload().unwrap_or_default().chars().count() as f32;
248 let payload_len = self.payload.len() as f32;
249
250 1.0 - (decoded_chars / payload_len)
251 }
252 pub fn raw(&self) -> &[u8] {
254 &self.payload
255 }
256 pub fn chaos_percents(&self) -> f32 {
258 self.chaos() * 100.0
259 }
260 pub fn coherence_percents(&self) -> f32 {
262 self.coherence() * 100.0
263 }
264 pub fn coherence(&self) -> f32 {
266 self.coherence_matches
267 .first()
268 .map(|lang| lang.score.0)
269 .unwrap_or_default()
270 }
271
272 pub fn decoded_payload(&self) -> Option<&str> {
274 self.decoded_payload.as_deref()
275 }
276
277 pub fn suitable_encodings(&self) -> Vec<String> {
280 std::iter::once(self.encoding.clone())
281 .chain(self.submatch.iter().map(|s| s.encoding.clone()))
282 .collect()
283 }
284 pub fn unicode_ranges(&self) -> Vec<String> {
286 let mut ranges: Vec<String> = range_scan(self.decoded_payload().unwrap_or_default())
287 .iter()
288 .cloned()
289 .collect();
290 ranges.sort_unstable();
291 ranges
292 }
293}
294
295#[derive(Debug, Default)]
302pub struct CharsetMatches {
303 items: Vec<CharsetMatch>,
304}
305
306pub struct CharsetMatchesIterMut<'a> {
307 items: std::slice::IterMut<'a, CharsetMatch>,
308}
309
310pub struct CharsetMatchesIter<'a> {
311 items: std::slice::Iter<'a, CharsetMatch>,
312}
313
314impl CharsetMatches {
315 pub fn new(items: Option<Vec<CharsetMatch>>) -> Self {
317 let mut items = items.unwrap_or_default();
318 CharsetMatches::resort(&mut items);
319 CharsetMatches { items }
320 }
321 pub fn from_single(item: CharsetMatch) -> Self {
322 CharsetMatches { items: vec![item] }
323 }
324 pub fn append(&mut self, item: CharsetMatch) {
327 if item.payload.len() <= TOO_BIG_SEQUENCE {
330 for m in &mut self.items {
331 if m.decoded_payload() == item.decoded_payload()
332 && (m.mean_mess_ratio - item.mean_mess_ratio).abs() < f32::EPSILON
333 {
334 m.add_submatch(&item);
335 return;
336 }
337 }
338 }
339 self.items.push(item);
340 CharsetMatches::resort(&mut self.items);
341 }
342 pub fn get_best(&self) -> Option<&CharsetMatch> {
344 self.items.first()
345 }
346 pub fn get_by_encoding(&self, encoding: &str) -> Option<&CharsetMatch> {
348 let encoding = iana_name(encoding)?;
349 self.items
350 .iter()
351 .find(|&i| i.suitable_encodings().contains(&encoding.to_string()))
352 }
353 fn resort(items: &mut [CharsetMatch]) {
355 items.sort_unstable();
356 }
357 pub fn iter_mut(&mut self) -> CharsetMatchesIterMut<'_> {
359 CharsetMatchesIterMut {
360 items: self.items.iter_mut(),
361 }
362 }
363 pub fn iter(&self) -> CharsetMatchesIter<'_> {
364 CharsetMatchesIter {
365 items: self.items.iter(),
366 }
367 }
368 pub fn len(&self) -> usize {
370 self.items.len()
371 }
372 pub fn is_empty(&self) -> bool {
374 self.items.is_empty()
375 }
376}
377
378impl Index<usize> for CharsetMatches {
379 type Output = CharsetMatch;
380 fn index(&self, index: usize) -> &Self::Output {
381 &self.items[index]
382 }
383}
384
385impl<'a> Iterator for CharsetMatchesIterMut<'a> {
386 type Item = &'a mut CharsetMatch;
387
388 fn next(&mut self) -> Option<Self::Item> {
389 self.items.next()
390 }
391}
392
393impl<'a> Iterator for CharsetMatchesIter<'a> {
394 type Item = &'a CharsetMatch;
395
396 fn next(&mut self) -> Option<Self::Item> {
397 self.items.next()
398 }
399}
400
401#[derive(Clone)]
402pub struct NormalizerSettings {
403 pub steps: usize,
405 pub chunk_size: usize,
407 pub threshold: OrderedFloat<f32>,
409 pub include_encodings: Vec<String>,
411 pub exclude_encodings: Vec<String>,
413 pub preemptive_behaviour: bool,
415 pub language_threshold: OrderedFloat<f32>,
417 pub enable_fallback: bool,
419}
420
421impl Default for NormalizerSettings {
422 fn default() -> Self {
423 NormalizerSettings {
424 steps: 5,
425 chunk_size: 512,
426 threshold: OrderedFloat(0.2),
427 include_encodings: vec![],
428 exclude_encodings: vec![],
429 preemptive_behaviour: true,
430 language_threshold: OrderedFloat(0.1),
431 enable_fallback: true,
432 }
433 }
434}