rustrict/
mtch.rs

1use crate::buffer_proxy_iterator::BufferProxyIterator;
2use crate::trie::Node;
3use crate::Type;
4use std::hash::{Hash, Hasher};
5
/// A candidate occurrence of a trie pattern in the text being scanned, together with
/// the bookkeeping used to judge how trustworthy the occurrence is.
///
/// Note that the `PartialEq`/`Hash` impls for this type look only at the identity of
/// `node` and at `begin_separate`; every other field is ignored for equality.
#[derive(Clone)]
pub(crate) struct Match {
    /// The word being matched.
    pub node: &'static Node,
    /// Stores the index in the string when this match was created.
    pub start: usize,
    /// Stores the index in the string when this match was completed.
    ///
    /// `commit` uses it as the inclusive upper bound of the censored range.
    pub end: usize,
    /// Stores the last matched character.
    pub last: char,
    /// Whether the match was preceded by a separator.
    pub begin_separate: bool,
    /// Whether the match was followed by a separator.
    pub end_separate: bool,
    /// Stores how many spaces appeared within the match, excluding spaces that directly correspond to the pattern.
    pub spaces: u8,
    /// Stores how many characters were skipped.
    pub skipped: u8,
    /// Stores how many replacements took place while matching.
    pub replacements: u8,
    /// Stores how many extra repetitions took place while matching.
    pub repetitions: u8,
    /// Stores how many low-confidence replacements took place while matching.
    pub low_confidence_replacements: u8,
}
31
32impl Match {
33    /// Combines in a way that the order of matches doesn't matter.
34    pub(crate) fn combine(&self, other: &Self) -> Self {
35        Self {
36            start: self.start.min(other.start),
37            spaces: self.spaces.min(other.spaces),
38            skipped: self.skipped.min(other.skipped),
39            replacements: self.replacements.min(other.replacements),
40            low_confidence_replacements: self
41                .low_confidence_replacements
42                .min(other.low_confidence_replacements),
43            repetitions: self.repetitions.min(other.repetitions),
44            last: self.last.min(other.last),
45            ..*self
46        }
47    }
48
49    fn confidence(&self) -> i64 {
50        let mut confidence: i64 = 0;
51        confidence += self.node.depth.max(1).ilog2() as i64;
52        confidence += (self.end - self.start).max(1).ilog2() as i64;
53        if self.node.depth == 1 {
54            confidence += 1;
55        } else {
56            if !self.begin_separate {
57                confidence -= 2;
58                if self.node.contains_space {
59                    confidence -= 3;
60                }
61            }
62            if !self.end_separate {
63                confidence -= 1;
64            }
65            if !self.begin_separate && !self.end_separate {
66                confidence -= 1;
67            }
68        }
69        if self.node.typ.is(Type::SEVERE) {
70            confidence += 3;
71        } else if self.node.typ.is(Type::MODERATE_OR_HIGHER)
72            && (self.node.depth == 1 || self.node.typ.isnt(Type::EVASIVE & Type::SEVERE))
73        {
74            confidence += 2
75        } else if self.node.typ.is(Type::MILD_OR_HIGHER)
76            && (self.node.depth == 1
77                || self.node.typ.isnt(Type::EVASIVE & Type::MODERATE_OR_HIGHER))
78        {
79            confidence += 1;
80        };
81        confidence -= (self.skipped as u16 + self.spaces as u16 + self.replacements as u16 + 1)
82            .ilog2() as i64;
83        confidence -= (self.low_confidence_replacements + 1).ilog2() as i64;
84        if self.node.depth == 2 && self.low_confidence_replacements > 0 {
85            // h8
86            confidence -= 2;
87        }
88        if self.node.typ.is(Type::EVASIVE & Type::SEVERE) {
89            confidence -= 3;
90        } else if self.node.typ.is(Type::EVASIVE & Type::MODERATE_OR_HIGHER) {
91            confidence -= 2;
92        } else if self.node.typ.is(Type::EVASIVE & Type::MILD) {
93            confidence -= 1;
94        }
95        confidence
96    }
97
98    /// Returns whether committed.
99    pub(crate) fn commit<I: Iterator<Item = char>>(
100        &self,
101        typ: &mut Type,
102        spy: &mut BufferProxyIterator<I>,
103        censor_threshold: Type,
104        censor_first_character_threshold: Type,
105        censor_replacement: char,
106    ) -> bool {
107        #[cfg(feature = "trace")]
108        print!(
109            "Committing {} with begin_separate={}, spaces={}, skipped={}, end_separate={}, depth={}, replacements={}, lcr={}, contains_space={}: ",
110            self.node.trace,
111            self.begin_separate,
112            self.spaces,
113            self.skipped,
114            self.end_separate,
115            self.node.depth,
116            self.replacements,
117            self.low_confidence_replacements,
118            self.node.contains_space
119        );
120
121        let confidence = self.confidence();
122
123        if confidence <= 0 {
124            #[cfg(feature = "trace")]
125            println!("rejected with confidence {confidence}");
126            return false;
127        }
128        #[cfg(feature = "trace")]
129        println!("accepted with confidence {confidence}");
130
131        /*
132        let too_many_replacements = !(self.begin_separate
133            && (self.end_separate
134                || (self.spaces == 0
135                    && self.node.depth > 2
136                    && self.node.typ.is(Type::MODERATE_OR_HIGHER))))
137            && self.node.depth > 1
138            // In theory, prevents blahsex, but allows blahsexblah.
139            && (!(self.end_separate || self.begin_separate) || self.node.depth < 3 || self.spaces.max(self.skipped).max(self.replacements) > 0 || self.node.typ.isnt(Type::MODERATE_OR_HIGHER))
140            && self.spaces.max(self.skipped).max(self.replacements) as usize + 4 > self.node.depth as usize;
141
142        let low_confidence_replacements = self.low_confidence_replacements > 0
143            && (self.low_confidence_replacements as usize
144                > (self.end - self.start).saturating_sub(1) || self.low_confidence_replacements as usize > (self.end - self.start).max(10) / 5 || self.node.depth < 3)
145            && self.node.depth > 1;
146
147        let low_confidence_short = self.replacements >= self.node.depth
148            && self.node.depth <= 3
149            && !self.node.typ.is(Type::SEVERE);
150
151        // Make it so "squirrels word" doesn't contain "s word"
152        let low_confidence_special = self.node.contains_space && !self.begin_separate;
153
154        if too_many_replacements
155            || low_confidence_replacements
156            || low_confidence_short
157            || low_confidence_special
158        {
159            // Match isn't strong enough.
160            #[cfg(feature = "trace")]
161            println!(
162                "(rejected: {} {} {} {})",
163                too_many_replacements,
164                low_confidence_replacements,
165                low_confidence_short,
166                low_confidence_special
167            );
168            return false;
169        }
170        */
171
172        // Apply detection.
173        *typ |= self.node.typ
174            | if self.replacements >= 2 {
175                Type::EVASIVE & Type::MILD
176            } else {
177                Type::NONE
178            };
179
180        // Decide whether to censor.
181        if self.node.typ.is(censor_threshold) {
182            // Decide whether to censor the first character.
183            let offset =
184                if self.node.typ.is(censor_first_character_threshold) || self.node.depth == 1 {
185                    0
186                } else {
187                    1
188                };
189            spy.censor(self.start + offset..=self.end, censor_replacement);
190        }
191
192        true
193    }
194}
195
196impl PartialEq for Match {
197    fn eq(&self, other: &Self) -> bool {
198        std::ptr::eq(self.node, other.node) && self.begin_separate == other.begin_separate
199    }
200}
201
// Marker impl: `PartialEq` for `Match` compares a node address and a `bool`, both of
// which are reflexive, so the relation is a full equivalence.
impl Eq for Match {}
203
204impl Hash for Match {
205    fn hash<H: Hasher>(&self, state: &mut H) {
206        state.write_usize(self.node as *const _ as usize);
207        state.write_u8(self.begin_separate as u8);
208    }
209}