charset_normalizer_rs/
entity.rs

1#![allow(unused_variables)]
2
3use crate::cd::{encoding_languages, mb_encoding_languages};
4use crate::consts::{IANA_SUPPORTED_ALIASES, TOO_BIG_SEQUENCE};
5use crate::utils::{decode, iana_name, is_multi_byte_encoding, range_scan};
6use encoding::DecoderTrap;
7use ordered_float::OrderedFloat;
8use std::borrow::Cow;
9use std::cmp::Ordering;
10use std::fmt;
11use std::fmt::{Debug, Display, Formatter};
12use std::hash::Hash;
13use std::ops::Index;
14
15/////////////////////////////////////////////////////////////////////////////////////
16// Languages
17/////////////////////////////////////////////////////////////////////////////////////
18
/// Natural languages that the detector can report for a decoded text.
///
/// `Unknown` is the fallback used when no language could be detected or inferred.
/// The type is a plain fieldless enum, so it is cheap to copy and can be passed by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    German,
    French,
    Dutch,
    Italian,
    Polish,
    Spanish,
    Russian,
    Japanese,
    Portuguese,
    Swedish,
    Chinese,
    Ukrainian,
    Norwegian,
    Finnish,
    Vietnamese,
    Czech,
    Hungarian,
    Korean,
    Indonesian,
    Turkish,
    Romanian,
    Farsi,
    Arabic,
    Danish,
    Serbian,
    Lithuanian,
    Slovene,
    Slovak,
    Hebrew,
    Bulgarian,
    Croatian,
    Hindi,
    Estonian,
    Thai,
    Greek,
    Tamil,
    Kazakh,
    /// No language could be determined.
    Unknown,
}
61
62impl Display for Language {
63    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
64        write!(f, "{:?}", self)
65    }
66}
67
68/////////////////////////////////////////////////////////////////////////////////////
69// CoherenceMatch & CoherenceMatches
70/////////////////////////////////////////////////////////////////////////////////////
71
72#[derive(Debug, PartialEq, Clone)]
73pub(crate) struct CoherenceMatch {
74    pub language: &'static Language,
75    pub score: OrderedFloat<f32>,
76}
77
78pub(crate) type CoherenceMatches = Vec<CoherenceMatch>;
79
80/////////////////////////////////////////////////////////////////////////////////////
81// CharsetMatch
82/////////////////////////////////////////////////////////////////////////////////////
83
84#[derive(Clone)]
85pub struct CharsetMatch {
86    payload: Cow<'static, [u8]>,
87    encoding: String,
88
89    mean_mess_ratio: OrderedFloat<f32>,
90    coherence_matches: CoherenceMatches,
91
92    has_sig_or_bom: bool,
93
94    submatch: Vec<CharsetMatch>,
95    decoded_payload: Option<String>,
96}
97
98impl Display for CharsetMatch {
99    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
100        write!(f, "{:?} ({})", self.payload, self.encoding)
101    }
102}
103
104impl Debug for CharsetMatch {
105    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
106        write!(f, "{:?} ({})", self.payload, self.encoding)
107    }
108}
109
110impl Default for CharsetMatch {
111    fn default() -> Self {
112        CharsetMatch {
113            payload: Cow::Borrowed(&[]),
114            encoding: "utf-8".to_string(),
115            mean_mess_ratio: OrderedFloat(0.0),
116            coherence_matches: vec![],
117            has_sig_or_bom: false,
118            submatch: vec![],
119            decoded_payload: None,
120        }
121    }
122}
123
124impl PartialEq<Self> for CharsetMatch {
125    fn eq(&self, other: &Self) -> bool {
126        self.encoding == other.encoding && self.decoded_payload == other.decoded_payload
127    }
128}
129
130impl Eq for CharsetMatch {}
131
132impl Ord for CharsetMatch {
133    fn cmp(&self, other: &Self) -> Ordering {
134        let mess_difference = (self.mean_mess_ratio - other.mean_mess_ratio).abs();
135        let coherence_a = OrderedFloat(self.coherence());
136        let coherence_b = OrderedFloat(other.coherence());
137        let coherence_difference = (coherence_a - coherence_b).abs();
138
139        // Below 1% difference --> Use Coherence
140        if mess_difference < 0.01 {
141            if coherence_difference > 0.02 {
142                return coherence_b.cmp(&coherence_a);
143            }
144            let multibyte_usage_a = OrderedFloat(self.multi_byte_usage());
145            let multibyte_usage_b = OrderedFloat(other.multi_byte_usage());
146            let multibyte_usage_delta = (multibyte_usage_a - multibyte_usage_b).abs();
147            if multibyte_usage_delta > f32::EPSILON {
148                return multibyte_usage_b.cmp(&multibyte_usage_a);
149            }
150        }
151        self.mean_mess_ratio.cmp(&other.mean_mess_ratio)
152    }
153}
154
155impl PartialOrd<Self> for CharsetMatch {
156    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
157        Some(self.cmp(other))
158    }
159}
160
161impl CharsetMatch {
162    // Init function
163    pub(crate) fn new(
164        payload: Cow<'static, [u8]>,
165        encoding: &str,
166        mean_mess_ratio: f32,
167        has_sig_or_bom: bool,
168        coherence_matches: &CoherenceMatches,
169        decoded_payload: Option<&str>,
170    ) -> Self {
171        CharsetMatch {
172            payload: payload.clone(),
173            encoding: String::from(encoding),
174            mean_mess_ratio: OrderedFloat(mean_mess_ratio),
175            coherence_matches: coherence_matches.clone(),
176            has_sig_or_bom,
177            submatch: vec![],
178            decoded_payload: decoded_payload.map(String::from).or_else(|| {
179                decode(&payload, encoding, DecoderTrap::Strict, false, true)
180                    .ok()
181                    .map(|res| res.strip_prefix('\u{feff}').unwrap_or(&res).to_string())
182            }),
183        }
184    }
185
186    // Add submatch
187    pub(crate) fn add_submatch(&mut self, submatch: &CharsetMatch) {
188        self.submatch.push(submatch.clone());
189        //self.decoded_payload = None;
190    }
191
192    // Get encoding aliases according to https://encoding.spec.whatwg.org/encodings.json
193    pub fn encoding_aliases(&self) -> Vec<&'static str> {
194        IANA_SUPPORTED_ALIASES
195            .get(self.encoding.as_str())
196            .cloned()
197            .expect("Problem with static HashMap IANA_SUPPORTED_ALIASES")
198    }
199    // byte_order_mark
200    pub fn bom(&self) -> bool {
201        self.has_sig_or_bom
202    }
203    pub fn encoding(&self) -> &str {
204        &self.encoding
205    }
206    pub fn chaos(&self) -> f32 {
207        self.mean_mess_ratio.0
208    }
209    // Most probable language found in decoded sequence. If none were detected or inferred, the property will return
210    // Language::Unknown
211    pub fn most_probably_language(&self) -> &'static Language {
212        self.coherence_matches.first().map_or_else(
213            // Default case: Trying to infer the language based on the given encoding
214            || {
215                if self.suitable_encodings().contains(&String::from("ascii")) {
216                    &Language::English
217                } else {
218                    let languages = if is_multi_byte_encoding(&self.encoding) {
219                        mb_encoding_languages(&self.encoding)
220                    } else {
221                        encoding_languages(self.encoding.clone())
222                    };
223                    languages.first().copied().unwrap_or(&Language::Unknown)
224                }
225            },
226            |lang| lang.language,
227        )
228    }
229    // Return the complete list of possible languages found in decoded sequence.
230    // Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
231    pub fn languages(&self) -> Vec<&'static Language> {
232        self.coherence_matches
233            .iter()
234            .map(|cm| cm.language)
235            .collect()
236    }
237    // Has submatch
238    pub fn has_submatch(&self) -> bool {
239        !self.submatch.is_empty()
240    }
241    // Return submatch list
242    pub fn submatch(&self) -> &Vec<CharsetMatch> {
243        &self.submatch
244    }
245    // Multibyte usage ratio
246    pub fn multi_byte_usage(&self) -> f32 {
247        let decoded_chars = self.decoded_payload().unwrap_or_default().chars().count() as f32;
248        let payload_len = self.payload.len() as f32;
249
250        1.0 - (decoded_chars / payload_len)
251    }
252    // Original untouched bytes
253    pub fn raw(&self) -> &[u8] {
254        &self.payload
255    }
256    // Return chaos in percents with rounding
257    pub fn chaos_percents(&self) -> f32 {
258        self.chaos() * 100.0
259    }
260    // Return coherence in percents with rounding
261    pub fn coherence_percents(&self) -> f32 {
262        self.coherence() * 100.0
263    }
264    // Most relevant language coherence
265    pub fn coherence(&self) -> f32 {
266        self.coherence_matches
267            .first()
268            .map(|lang| lang.score.0)
269            .unwrap_or_default()
270    }
271
272    // To recalc decoded_payload field
273    pub fn decoded_payload(&self) -> Option<&str> {
274        self.decoded_payload.as_deref()
275    }
276
277    // The complete list of encodings that output the exact SAME str result and therefore could be the originating
278    // encoding. This list does include the encoding available in property 'encoding'.
279    pub fn suitable_encodings(&self) -> Vec<String> {
280        std::iter::once(self.encoding.clone())
281            .chain(self.submatch.iter().map(|s| s.encoding.clone()))
282            .collect()
283    }
284    // Returns sorted list of unicode ranges (if exists)
285    pub fn unicode_ranges(&self) -> Vec<String> {
286        let mut ranges: Vec<String> = range_scan(self.decoded_payload().unwrap_or_default())
287            .iter()
288            .cloned()
289            .collect();
290        ranges.sort_unstable();
291        ranges
292    }
293}
294
295/////////////////////////////////////////////////////////////////////////////////////
296// CharsetMatches
// Container holding every CharsetMatch item, ordered by default from the most
// probable to the least probable.
299/////////////////////////////////////////////////////////////////////////////////////
300
301#[derive(Debug, Default)]
302pub struct CharsetMatches {
303    items: Vec<CharsetMatch>,
304}
305
306pub struct CharsetMatchesIterMut<'a> {
307    items: std::slice::IterMut<'a, CharsetMatch>,
308}
309
310pub struct CharsetMatchesIter<'a> {
311    items: std::slice::Iter<'a, CharsetMatch>,
312}
313
314impl CharsetMatches {
315    // Initialization method
316    pub fn new(items: Option<Vec<CharsetMatch>>) -> Self {
317        let mut items = items.unwrap_or_default();
318        CharsetMatches::resort(&mut items);
319        CharsetMatches { items }
320    }
321    pub fn from_single(item: CharsetMatch) -> Self {
322        CharsetMatches { items: vec![item] }
323    }
324    // Insert a single match. Will be inserted accordingly to preserve sort.
325    // Can be inserted as a submatch.
326    pub fn append(&mut self, item: CharsetMatch) {
327        // We should disable the submatch factoring when the input file is too heavy
328        // (conserve RAM usage)
329        if item.payload.len() <= TOO_BIG_SEQUENCE {
330            for m in &mut self.items {
331                if m.decoded_payload() == item.decoded_payload()
332                    && (m.mean_mess_ratio - item.mean_mess_ratio).abs() < f32::EPSILON
333                {
334                    m.add_submatch(&item);
335                    return;
336                }
337            }
338        }
339        self.items.push(item);
340        CharsetMatches::resort(&mut self.items);
341    }
342    // Simply return the first match. Strict equivalent to matches[0].
343    pub fn get_best(&self) -> Option<&CharsetMatch> {
344        self.items.first()
345    }
346    // Retrieve a single item either by its position or encoding name (alias may be used here).
347    pub fn get_by_encoding(&self, encoding: &str) -> Option<&CharsetMatch> {
348        let encoding = iana_name(encoding)?;
349        self.items
350            .iter()
351            .find(|&i| i.suitable_encodings().contains(&encoding.to_string()))
352    }
353    // Resort items by relevancy (for internal use)
354    fn resort(items: &mut [CharsetMatch]) {
355        items.sort_unstable();
356    }
357    // iterator
358    pub fn iter_mut(&mut self) -> CharsetMatchesIterMut<'_> {
359        CharsetMatchesIterMut {
360            items: self.items.iter_mut(),
361        }
362    }
363    pub fn iter(&self) -> CharsetMatchesIter<'_> {
364        CharsetMatchesIter {
365            items: self.items.iter(),
366        }
367    }
368    // len
369    pub fn len(&self) -> usize {
370        self.items.len()
371    }
372    // is empty?
373    pub fn is_empty(&self) -> bool {
374        self.items.is_empty()
375    }
376}
377
378impl Index<usize> for CharsetMatches {
379    type Output = CharsetMatch;
380    fn index(&self, index: usize) -> &Self::Output {
381        &self.items[index]
382    }
383}
384
385impl<'a> Iterator for CharsetMatchesIterMut<'a> {
386    type Item = &'a mut CharsetMatch;
387
388    fn next(&mut self) -> Option<Self::Item> {
389        self.items.next()
390    }
391}
392
393impl<'a> Iterator for CharsetMatchesIter<'a> {
394    type Item = &'a CharsetMatch;
395
396    fn next(&mut self) -> Option<Self::Item> {
397        self.items.next()
398    }
399}
400
401#[derive(Clone)]
402pub struct NormalizerSettings {
403    /// How many steps (chunks) should be used from file
404    pub steps: usize,
405    /// Each chunk size
406    pub chunk_size: usize,
407    /// Mess ration threshold
408    pub threshold: OrderedFloat<f32>,
409    /// Specify probing encodings exactly
410    pub include_encodings: Vec<String>,
411    /// Exclude these encodings from probing
412    pub exclude_encodings: Vec<String>,
413    /// Allow try to find charset in the text
414    pub preemptive_behaviour: bool,
415    /// Language detector threshold
416    pub language_threshold: OrderedFloat<f32>,
417    /// Allow fallback to ASCII / UTF-8
418    pub enable_fallback: bool,
419}
420
421impl Default for NormalizerSettings {
422    fn default() -> Self {
423        NormalizerSettings {
424            steps: 5,
425            chunk_size: 512,
426            threshold: OrderedFloat(0.2),
427            include_encodings: vec![],
428            exclude_encodings: vec![],
429            preemptive_behaviour: true,
430            language_threshold: OrderedFloat(0.1),
431            enable_fallback: true,
432        }
433    }
434}