masker/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3//! # masker
4//!
5//! This crate provides an object [`Masker`] which can replace
6//! a set of (potentially overlapping) patterns in input data
7//! with a fixed mask. It's usually faster than simply doing
8//! searches with replacement one by one, and it handles two
9//! awkward cases:
10//! - It allows you to provide the data in chunks, for example
11//!   to mask data as you upload it.
12//! - It handles the case where masked regions blend together, for
13//!   example if two patterns are present and overlap, then replacing
14//!   either of them first leaves characters from the other visible.
15//!
16//! There are the following features:
17#![cfg_attr(
18    feature = "streams",
19    doc = r##"
20 - `"streams"` - Mask [`Stream`] implementations by wrapping them.
21"##
22)]
23#![cfg_attr(
24    not(feature = "streams"),
25    doc = r##"
26 - `"streams"` - Support for masking streams.
27"##
28)]
29#![warn(missing_docs)]
30use core::fmt::{Debug, Error, Formatter};
31use std::collections::BTreeMap;
32
33#[cfg(feature = "streams")]
34use bytes::Bytes;
35#[cfg(feature = "streams")]
36use futures::stream::Stream;
37
/// A pattern to mask
///
/// A pattern is made of two parts: a fixed prefix whose bytes must
/// appear in order, and a suffix alphabet from which any run of
/// bytes may follow. Either part may be empty. When a prefix is
/// given, the caller chooses whether it is masked or left visible.
#[derive(Clone, Eq, PartialEq)]
pub struct MatchData<'a> {
    /// The exact byte sequence that starts the pattern.
    ///
    /// If this is empty, every byte from `suffix` is masked wherever
    /// it occurs in the input.
    pub prefix: &'a [u8],
    /// The alphabet of bytes accepted after the prefix.
    ///
    /// This is a set, not a sequence: any run of bytes drawn from it,
    /// in any order, is matched. This may be empty.
    pub suffix: &'a [u8],
    /// When true the prefix is masked along with the suffix; when
    /// false the prefix is left in the output.
    pub mask_prefix: bool,
}

impl<'a> Debug for MatchData<'a> {
    /// Render as `[PDATA "<prefix>" "<suffix>"]`, with a trailing ` MP`
    /// marker when the prefix is masked. Non-UTF-8 bytes are shown lossily.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
        let prefix = String::from_utf8_lossy(self.prefix);
        let suffix = String::from_utf8_lossy(self.suffix);
        let marker = if self.mask_prefix { " MP" } else { "" };
        write!(f, "[PDATA {:?} {:?}{}]", prefix, suffix, marker)
    }
}
72
73#[derive(Clone, Eq, PartialEq)]
74struct Match<'a, 'b>
75where
76    'b: 'a,
77{
78    data: &'a MatchData<'b>,
79    match_idx: usize,
80    offset: usize,
81}
82
83impl<'a, 'b> Match<'a, 'b> {
84    pub fn new(data: &'a MatchData<'b>, match_idx: usize, offset: usize) -> Self {
85        Self {
86            data,
87            match_idx,
88            offset,
89        }
90    }
91
92    pub fn index(&self) -> usize {
93        self.match_idx
94    }
95
96    pub fn past_offset(&self, offset: &usize) -> bool {
97        self.offset >= *offset
98    }
99
100    pub fn allowed_next(&self) -> &'_ [u8] {
101        if self.offset < self.data.prefix.len() {
102            &self.data.prefix[self.offset..self.offset + 1]
103        } else {
104            self.data.suffix
105        }
106    }
107
108    pub fn try_next(&self, action: u8) -> (Option<Self>, Option<(usize, usize)>) {
109        if self.offset < self.data.prefix.len() {
110            if action == self.data.prefix[self.offset] {
111                let offset = self.offset + 1;
112                let span = (self.data.mask_prefix && offset == self.data.prefix.len())
113                    .then_some((self.data.prefix.len(), 0));
114                (Some(Match::new(self.data, self.match_idx, offset)), span)
115            } else {
116                (None, None)
117            }
118        } else if self.data.suffix.contains(&action) {
119            if !self.data.prefix.is_empty() {
120                // distinguish having matched the prefix from having matched the
121                // prefix AND having matched something in the suffix
122                let offset = std::cmp::min(self.offset + 1, self.data.prefix.len() + 2);
123                let span = if self.data.mask_prefix && !self.data.prefix.is_empty() {
124                    Some((2, 0))
125                } else {
126                    Some((offset - self.data.prefix.len(), 0))
127                };
128                (Some(Match::new(self.data, self.match_idx, offset)), span)
129            } else {
130                // We're matching bare characters, so there's no
131                // prefix check, and no interesting state to keep
132                (None, Some((1, 0)))
133            }
134        } else {
135            (None, None)
136        }
137    }
138
139    /// How many characters back could potentially mask
140    pub fn prefix_length(&self) -> usize {
141        let pfx = if self.data.mask_prefix {
142            self.offset
143        } else {
144            0
145        };
146
147        if self.offset < self.data.prefix.len() + 1 {
148            pfx
149        } else {
150            pfx + (self.offset - self.data.prefix.len())
151        }
152    }
153}
154
155impl<'a, 'b> Debug for Match<'a, 'b> {
156    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
157        write!(
158            f,
159            "[PREFIX '{}' '{}' {}]",
160            String::from_utf8_lossy(self.data.prefix),
161            String::from_utf8_lossy(self.data.suffix),
162            self.offset
163        )
164    }
165}
166
167#[derive(Clone, Default, PartialEq, Eq)]
168struct State<'a, 'b> {
169    matches: Vec<Match<'a, 'b>>,
170    text: Vec<u8>,
171    spans: Vec<(usize, usize)>,
172    text_offset: usize,
173}
174
175impl<'a, 'b> State<'a, 'b> {
176    fn new(
177        matches: Vec<Match<'a, 'b>>,
178        text: Vec<u8>,
179        spans: Vec<(usize, usize)>,
180        text_offset: usize,
181    ) -> Self {
182        Self {
183            matches,
184            text,
185            spans,
186            text_offset,
187        }
188    }
189
190    fn generate_actions(&self, datas: &[MatchData]) -> Vec<Option<u8>> {
191        let mut res = Vec::new();
192        for pfx in self.matches.iter() {
193            for ch in pfx.allowed_next() {
194                if !res.contains(&Some(*ch)) {
195                    res.push(Some(*ch));
196                }
197            }
198        }
199
200        for data in datas.iter() {
201            if !data.prefix.is_empty() {
202                let ch = data.prefix[0];
203                if !res.contains(&Some(ch)) {
204                    res.push(Some(ch))
205                }
206            } else {
207                for ch in data.suffix.as_ref().iter() {
208                    if !res.contains(&Some(*ch)) {
209                        res.push(Some(*ch))
210                    }
211                }
212            }
213        }
214
215        res.push(None);
216        res
217    }
218}
219
220impl<'a, 'b> std::fmt::Debug for State<'a, 'b> {
221    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
222        write!(
223            f,
224            "[STATE '{}' {:?}",
225            String::from_utf8_lossy(&self.text),
226            self.spans
227        )?;
228        for s in self.matches.iter() {
229            write!(f, " {:?}", s)?;
230        }
231        write!(f, " TxtOff: {}", self.text_offset)?;
232        write!(f, "]")
233    }
234}
235
/// A single transition of the state machine: consuming `action` in
/// state `source` moves to state `target`, optionally emitting bytes.
#[derive(Clone, Default, PartialEq, Eq, Ord, PartialOrd)]
struct Link {
    source: usize,
    target: usize,
    action: u8,
    emitted: Option<Vec<u8>>,
}

impl Link {
    /// Build a transition from its four components.
    pub fn new(source: usize, target: usize, action: u8, emitted: Option<Vec<u8>>) -> Self {
        Self {
            source,
            target,
            action,
            emitted,
        }
    }
}

impl Debug for Link {
    /// Render as `[LINK <src> -> <dst> '<char>' (<byte>)]`, with the
    /// emitted bytes appended in double quotes when there are any.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
        write!(
            f,
            "[LINK {} -> {} '{}' ({})",
            self.source,
            self.target,
            // Every u8 value is a valid Unicode scalar value.
            char::from(self.action),
            self.action,
        )?;
        if let Some(bytes) = &self.emitted {
            write!(f, r#" "{}""#, String::from_utf8_lossy(bytes))?;
        }
        write!(f, "]")
    }
}
272
273#[derive(Debug, Clone, Copy)]
274struct LinkKey(usize);
275
276#[derive(Clone, Debug)]
277struct Links {
278    source_offset: Vec<usize>,
279    target: Vec<usize>,
280    actions: Vec<u8>,
281    emitted: Vec<Option<Vec<u8>>>,
282}
283
284impl Links {
285    pub fn new(mut links: Vec<Link>) -> Self {
286        links.sort_by(|a, b| a.source.cmp(&b.source));
287        let mut source_offset = Vec::new();
288        let mut target = Vec::new();
289        let mut actions = Vec::new();
290        let mut emitted = Vec::new();
291        let mut prev_state = 0;
292        source_offset.push(0);
293        for link in links {
294            while prev_state < link.source {
295                source_offset.push(target.len());
296                prev_state += 1;
297            }
298            target.push(link.target);
299            actions.push(link.action);
300            emitted.push(link.emitted);
301        }
302        source_offset.push(target.len());
303        Self {
304            source_offset,
305            target,
306            actions,
307            emitted,
308        }
309    }
310
311    pub fn get(&self, state: usize, action: u8) -> Option<LinkKey> {
312        let start = self.source_offset[state];
313        let end = self.source_offset[state + 1];
314        for i in start..end {
315            if self.actions[i] == action {
316                return Some(LinkKey(i));
317            }
318        }
319        None
320    }
321
322    pub fn target(&self, key: LinkKey) -> usize {
323        self.target[key.0]
324    }
325
326    pub fn emitted(&self, key: LinkKey) -> Option<&Vec<u8>> {
327        self.emitted[key.0].as_ref()
328    }
329}
330
/// Per-state output used when no byte-specific transition applies.
///
/// `emitted[s]` holds the bytes to emit when leaving state `s` by
/// the default route, or `None` when nothing is emitted.
#[derive(Clone, Debug)]
struct DefaultLinks {
    emitted: Vec<Option<Vec<u8>>>,
}

impl DefaultLinks {
    /// Flatten a state-indexed map into a dense table.
    ///
    /// States missing from the map, up to the largest key, get `None`.
    pub fn new(default_links: BTreeMap<usize, Option<Vec<u8>>>) -> Self {
        // A BTreeMap iterates its keys in ascending order, so the last
        // key is the largest one and fixes the table length.
        let table_len = default_links
            .keys()
            .next_back()
            .map_or(0, |largest| largest + 1);

        let mut emitted = vec![None; table_len];
        for (state, output) in default_links {
            emitted[state] = output;
        }
        Self { emitted }
    }

    /// Default output for `state`, if there is one.
    pub fn get(&self, state: usize) -> Option<&Vec<u8>> {
        let slot = &self.emitted[state];
        slot.as_ref()
    }
}
352
/// Merge overlapping spans into a sorted list of disjoint spans.
///
/// Each span is a half-open `(start, end)` pair. Spans are sorted and
/// any span that starts strictly inside the span being accumulated
/// is folded into it. Spans that merely touch (one ends where the
/// next begins) are kept separate. An empty input gives an empty
/// result.
fn unify_spans(spans: &[(usize, usize)]) -> Vec<(usize, usize)> {
    let mut sorted = spans.to_vec();
    sorted.sort_unstable();

    let mut merged = Vec::new();
    let mut remaining = sorted.into_iter();
    let mut current = match remaining.next() {
        Some(first) => first,
        None => return merged,
    };

    // Start from the second span: comparing the first span against
    // itself would emit a leading zero-length span twice.
    for span in remaining {
        if span.0 < current.1 {
            current.1 = std::cmp::max(current.1, span.1);
        } else {
            merged.push(current);
            current = span;
        }
    }
    merged.push(current);

    merged
}
376
/// Copy `input` to a new buffer, replacing each span with `mask`.
///
/// `spans` are half-open `(start, end)` ranges in absolute
/// coordinates, and `input[0]` sits at absolute position `offset`.
/// A span that begins before `offset` still contributes one copy of
/// `mask` at the front of the output. If it also extends past
/// `offset`, the bytes of `input` that it covers are dropped.
fn mask_spans<'a>(
    spans: &[(usize, usize)],
    input: &'a [u8],
    mask: &[u8],
    offset: usize,
) -> Vec<u8> {
    let mut out = Vec::new();
    let mut cursor = 0;

    // Spans that start before the window: emit their mask up front and
    // stop on the first one that reaches into the window.
    while let Some(&(start, end)) = spans.get(cursor) {
        if start >= offset {
            break;
        }
        out.extend_from_slice(mask);
        if end > offset {
            break;
        }
        cursor += 1;
    }

    for (pos, &byte) in (offset..).zip(input) {
        match spans.get(cursor) {
            Some(&(start, end)) if pos >= start => {
                // Inside the current span: emit the mask once, at its
                // first byte, and swallow the covered bytes.
                if pos == start {
                    out.extend_from_slice(mask);
                }
                if pos + 1 == end {
                    cursor += 1;
                }
            }
            _ => out.push(byte),
        }
    }
    out
}
411
/// Replace byte sequences with a chosen mask value
///
/// This is the central object of this crate. When creating one, an
/// FSM is constructed that can run over a block of data, byte by
/// byte, and replace a set of pre-selected patterns by a mask
/// pattern.
///
/// You can provide all the data up front, using
/// [`mask_slice`](Masker::mask_slice) and
/// [`mask_str`](Masker::mask_str), or you can opt to stream your data
/// through it using [`mask_chunks`](Masker::mask_chunks).
///
/// Example:
/// ```rust
/// use masker::Masker;
///
/// let m = Masker::new(&["frog", "cat"], "XXXX");
/// let s = m.mask_str("the bad frog sat on the cat");
/// assert_eq!(s.as_str(), "the bad XXXX sat on the XXXX");
/// ```
#[derive(Clone, Debug)]
pub struct Masker {
    /// Transitions of the state machine, looked up by current state
    /// and input byte. Each one names the next state and may carry
    /// bytes to emit.
    links: Links,
    /// Bytes to emit, per state, when the input byte has no transition
    /// from that state, or when the input ends in that state.
    default_links: DefaultLinks,
}
437
438impl Masker {
    /// Create a new masker
    ///
    /// `input_data` are the things you want to mask in any data
    /// that is given to the masker. `mask` is the replacement you
    /// want instead of those input data.
    ///
    /// This builds a finite state machine capable of processing
    /// the given input data and mask, and has non-negligible cost,
    /// similar to a regex compilation.
    ///
    /// Note that it is permissible for the input data to have overlaps,
    /// e.g. the case where you mask "cater" and "bobcat" is handled.
    ///
    /// Example:
    /// ```rust
    /// use masker::Masker;
    ///
    /// let m = Masker::new(&["cater", "bobcat"], "XXXX");
    /// let s = m.mask_str("what does bobcater do?");
    /// assert_eq!(s.as_str(), "what does XXXX do?");
    /// ```
    pub fn new<S, T>(input_data: &[S], mask: T) -> Masker
    where
        S: AsRef<[u8]>,
        T: AsRef<[u8]>,
    {
        // Plain strings are the special case with no prefix/suffix patterns.
        Self::new_with_match_data(input_data, &[], mask)
    }
467
    /// Create a new masker with prefix support
    ///
    /// `input_data` are the things you want to mask in any data
    /// that is given to the masker. `mask` is the replacement you
    /// want instead of those input data.
    ///
    /// `match_data` is a slice of [`MatchData`] objects that describe
    /// some prefix/suffix pairs you would like to mask. This is
    /// strictly a superset of the capabilities provided in `input_data`,
    /// it's just less convenient to construct. Any entry in `input_data`
    /// is equivalent to a [`MatchData`] with the prefix set to the value,
    /// suffix empty, and `mask_prefix` set to `true`.
    ///
    /// This builds a finite state machine capable of processing
    /// the given inputs and mask, and has non-negligible cost,
    /// similar to a regex compilation.
    ///
    /// Note that it is permissible for the input data to have
    /// overlaps, e.g. the case where you mask "cater" and "bobcat" is
    /// handled. This extends to the case of token prefixes.
    ///
    /// Example:
    /// ```rust
    /// use masker::{Masker, MatchData};
    ///
    /// let input_data: &[&str] = &[];
    /// let match_data = &[ MatchData { prefix: "secret".as_ref(), suffix: "abc".as_ref(), mask_prefix: true } ];
    /// let m = Masker::new_with_match_data(input_data, match_data, "XXXX".as_bytes());
    /// let s = m.mask_str("what does secretbaaa do?");
    /// assert_eq!(s.as_str(), "what does XXXX do?");
    /// ```
    pub fn new_with_match_data<S, T>(input_data: &[S], match_data: &[MatchData], mask: T) -> Masker
    where
        S: AsRef<[u8]>,
        T: AsRef<[u8]>,
    {
        // Treat every plain input as a masked prefix with no suffix, so
        // that the rest of the construction only deals with `MatchData`.
        let prefix_data = input_data
            .iter()
            .map(|s| MatchData {
                prefix: s.as_ref(),
                suffix: &[],
                mask_prefix: true,
            })
            .chain(match_data.iter().cloned())
            .collect::<Vec<_>>();

        // State 0 is the empty start state; `work` is the stack of state
        // indices whose outgoing links still have to be computed.
        let mut states: Vec<State<'_, '_>> = vec![Default::default()];
        let mut links = Vec::new();
        let mut default_links = BTreeMap::new();
        let mut work = vec![0usize];

        // `coverage[(d1, d2)]` is recorded when every byte of d2's prefix
        // and of d2's suffix alphabet belongs to d1's suffix alphabet. The
        // stored value is the offset (d1's prefix length + 1) from which a
        // running match of d1 makes starting a new match of d2 unnecessary.
        let mut coverage = BTreeMap::new();
        for d1 in 0..prefix_data.len() {
            for d2 in 0..prefix_data.len() {
                let mut failed = false;
                // First try to match d2's prefix, starting at this point
                for j in 0..prefix_data[d2].prefix.len() {
                    if !prefix_data[d1].suffix.contains(&prefix_data[d2].prefix[j]) {
                        failed = true;
                        break;
                    }
                }
                if failed {
                    continue;
                }
                for ch in prefix_data[d2].suffix {
                    if !prefix_data[d1].suffix.contains(ch) {
                        failed = true;
                        break;
                    }
                }
                if failed {
                    continue;
                }
                // So now they cover
                coverage.insert((d1, d2), prefix_data[d1].prefix.len() + 1);
            }
        }

        while let Some(index) = work.pop() {
            let actions = states[index].generate_actions(&prefix_data);

            // `Some(byte)` is a specific input byte; `None` is the default
            // route taken for any other byte (and at end of input).
            for action in actions {
                // STEP 1: Find the new matches, and spans of completed matches
                let mut new_matches = Vec::new();
                let mut new_spans = states[index].spans.to_vec();
                let new_text = {
                    let mut t = states[index].text.clone();
                    if let Some(action) = action {
                        t.push(action);
                    }
                    t
                };
                let text_offset = states[index].text_offset;
                let full_text_len = new_text.len() + text_offset;

                if let Some(action) = action {
                    // Advance the matches that are already in progress.
                    for pfx in states[index].matches.iter() {
                        let (pfx, span) = pfx.try_next(action);
                        if let Some(new_pfx) = pfx {
                            if !new_matches.contains(&new_pfx) {
                                new_matches.push(new_pfx);
                            }
                        }
                        if let Some((s1, s2)) = span {
                            // Spans come back as distances from the end of
                            // the text; convert them to absolute positions.
                            new_spans.push((
                                full_text_len - std::cmp::min(full_text_len, s1),
                                full_text_len - std::cmp::min(full_text_len, s2),
                            ));
                        }
                    }

                    // Try to start a fresh match of every pattern, unless a
                    // running match already covers it (see `coverage`).
                    for (ix, data) in prefix_data.iter().enumerate() {
                        let mut covered = false;
                        for pfx in states[index].matches.iter() {
                            if let Some(start) = coverage.get(&(pfx.index(), ix)) {
                                if pfx.past_offset(start) {
                                    covered = true;
                                    break;
                                }
                            }
                        }
                        if covered {
                            continue;
                        }

                        let pfx = Match::new(data, ix, 0);
                        let (pfx, span) = pfx.try_next(action);
                        if let Some(new_pfx) = pfx {
                            if !new_matches.contains(&new_pfx) {
                                new_matches.push(new_pfx);
                            }
                        }
                        if let Some((s1, s2)) = span {
                            new_spans.push((
                                full_text_len - std::cmp::min(full_text_len, s1),
                                full_text_len - std::cmp::min(full_text_len, s2),
                            ));
                        }
                    }
                }

                // STEP 2: Find the emitted text, based on the spans present
                let unified_spans = unify_spans(&new_spans);
                let mut emitted_spans = Vec::new();
                let mut kept_spans = Vec::new();
                // Keep any char that may be in the extent of a new span
                let new_extent = new_matches
                    .iter()
                    .map(|m| m.prefix_length())
                    .max()
                    .unwrap_or(0usize);
                let mut first_kept_char = full_text_len - std::cmp::min(full_text_len, new_extent);

                for (x1, x2) in unified_spans {
                    // This span does not overlap with any span we are
                    // building
                    if x2 + new_extent <= full_text_len {
                        emitted_spans.push((x1, x2));
                    } else {
                        kept_spans.push((x1, x2));
                        // This span overlaps some new span, which means we
                        // may need to keep this text too
                        first_kept_char = std::cmp::min(first_kept_char, x1);
                    }
                }

                // Everything before `first_kept_char` is final: mask it and
                // attach it to the link as the emitted output.
                let emitted_text = if first_kept_char > 0 {
                    let s = mask_spans(
                        &emitted_spans,
                        &new_text[0..(first_kept_char - text_offset)],
                        mask.as_ref(),
                        text_offset,
                    );
                    if !s.is_empty() {
                        Some(s)
                    } else {
                        None
                    }
                } else {
                    None
                };

                // Prune all emitted text
                let (new_text, new_text_offset) = if first_kept_char > text_offset {
                    (&new_text[(first_kept_char - text_offset)..], 0)
                } else {
                    (new_text.as_slice(), text_offset - first_kept_char)
                };

                // Rebase spans for pruning
                let mut kept_spans = kept_spans
                    .into_iter()
                    .map(|(a, b)| (a - first_kept_char, b - first_kept_char))
                    .collect::<Vec<_>>();
                kept_spans.sort_by(|a: &(usize, usize), b: &(usize, usize)| a.0.cmp(&b.0));

                // STEP 3: Clear any left-anchored text that will be masked,
                //         replacing it with an empty span to mark the spot
                let cleared = if let Some(first_span) = kept_spans.first().copied() {
                    if first_span.0 == 0 && first_span.1 > 0 {
                        first_span.1
                    } else {
                        0
                    }
                } else {
                    0
                };

                let (new_text, new_text_offset) = if cleared > 0 {
                    if cleared > new_text_offset {
                        (&new_text[(cleared - new_text_offset)..], 1)
                    } else {
                        (new_text, new_text_offset - cleared + 1)
                    }
                } else {
                    (new_text, 0)
                };
                // Shift the remaining spans left by `cleared - 1`, so the
                // cleared region collapses to a single position.
                let kept_spans = if cleared > 0 {
                    kept_spans
                        .into_iter()
                        .map(|(a, b)| {
                            (
                                a - std::cmp::min(a, cleared - 1),
                                b - std::cmp::min(b, cleared - 1),
                            )
                        })
                        .collect::<Vec<_>>()
                } else {
                    kept_spans
                };

                let new_state =
                    State::new(new_matches, new_text.to_vec(), kept_spans, new_text_offset);

                // Reuse an identical existing state if there is one;
                // otherwise register the new state and queue it.
                let new_index = if let Some(new_index) = states.iter().position(|x| x == &new_state)
                {
                    new_index
                } else {
                    let new_index = states.len();
                    states.push(new_state);
                    work.push(new_index);
                    new_index
                };

                if let Some(action) = action {
                    let lnk = Link::new(index, new_index, action, emitted_text);
                    links.push(lnk);
                } else {
                    // The default route only records what to emit; the
                    // masking loops reset to state 0 after taking it.
                    default_links.insert(index, emitted_text);
                }
            }
        }

        Self {
            links: Links::new(links),
            default_links: DefaultLinks::new(default_links),
        }
    }
727
728    /// Apply masking to a slice of data
729    ///
730    /// All patterns and overlaps found within the given data block
731    /// will be replaced with the previously chosen mask value.
732    ///
733    /// Example
734    /// ```rust
735    /// use masker::Masker;
736    ///
737    /// let m = Masker::new(&["frog", "cat"], "XXXX");
738    /// let v = m.mask_slice("the bad frog sat on the cat".as_bytes());
739    /// assert_eq!(v.as_slice(), "the bad XXXX sat on the XXXX".as_bytes());
740    /// ```
741    pub fn mask_slice<S>(&self, input: S) -> Vec<u8>
742    where
743        S: AsRef<[u8]>,
744    {
745        let mut state = 0usize;
746        let mut res = Vec::new();
747        res.reserve(input.as_ref().len());
748        for ch in input.as_ref().iter() {
749            if let Some(link) = self.links.get(state, *ch) {
750                if let Some(emitted) = self.links.emitted(link) {
751                    res.extend(emitted);
752                }
753                state = self.links.target(link);
754            } else {
755                if let Some(emitted) = self.default_links.get(state) {
756                    res.extend(emitted);
757                }
758                res.push(*ch);
759                state = 0;
760            }
761        }
762        if let Some(emitted) = self.default_links.get(state) {
763            res.extend(emitted);
764        }
765        res
766    }
767
768    /// Apply masking to text
769    ///
770    /// All patterns and overlaps found within the given data block
771    /// will be replace with the previously chosen mask value.
772    ///
773    /// This is simply a convenience wrapper over
774    /// [`mask_slice`](Masker::mask_slice).
775    pub fn mask_str<S>(&self, input: S) -> String
776    where
777        S: AsRef<str>,
778    {
779        String::from_utf8(self.mask_slice(input.as_ref())).unwrap()
780    }
781
    /// Begin masking a stream of data chunks
    ///
    /// The return value is an object that will handle the sequence.
    /// It has to be this way because we need to cope with masking
    /// over chunk boundaries, and for that reason there can be some
    /// limited buffering introduced here.
    ///
    /// The returned [`ChunkMasker`] borrows this masker, so the masker
    /// must outlive it.
    ///
    /// Example:
    /// ```rust
    /// use masker::Masker;
    ///
    /// let m = Masker::new(&["frog", "cat"], "XXXX");
    /// let mut cm = m.mask_chunks();
    /// let mut v = Vec::new();
    /// v.extend(cm.mask_chunk("the ba"));
    /// v.extend(cm.mask_chunk("d f"));
    /// v.extend(cm.mask_chunk("rog sat on the c"));
    /// v.extend(cm.mask_chunk("at"));
    /// v.extend(cm.finish());
    ///
    /// assert_eq!(v.as_slice(), "the bad XXXX sat on the XXXX".as_bytes());
    /// ```
    pub fn mask_chunks(&self) -> ChunkMasker<'_> {
        // Each chunk masker starts from the initial state (state 0).
        ChunkMasker::new(self)
    }
807
    /// Wrap a [`Stream`] producing a new one whose content is masked.
    ///
    /// The error type of the underlying stream is preserved, since
    /// the [`Masker`] does not produce its own errors. This is only
    /// available on the crate feature `streams`.
    ///
    /// The wrapped stream must be [`Unpin`], and the returned stream
    /// borrows this masker for as long as it is in use.
    #[cfg(feature = "streams")]
    pub fn mask_stream<S, E>(&self, stream: S) -> streams::MaskedStream<'_, S, E>
    where
        S: Stream<Item = Result<Bytes, E>> + Unpin,
    {
        streams::MaskedStream::new(stream, self)
    }
820}
821
822/// Mask a sequence of data blocks
823///
824/// This is the return value from [`Masker::mask_chunks`] and cannot
825/// be constructed directly. It expects data to be fed into it
826/// sequentially and will correctly mask patterns that cross chunk
827/// boundaries.
828pub struct ChunkMasker<'a> {
829    owner: &'a Masker,
830    state: usize,
831}
832
833impl<'a> ChunkMasker<'a> {
834    fn new(owner: &'a Masker) -> Self {
835        Self { owner, state: 0 }
836    }
837
838    /// Process the next block of data
839    ///
840    /// Not all the output from this block will necessarily be
841    /// produced as a result of this call; some buffering will be
842    /// required if there is a possible match that extends past the
843    /// end of the chunk.  Similarly, some output may be produced from
844    /// previous chunks, if buffering was introduced there.
845    pub fn mask_chunk<C>(&mut self, chunk: C) -> Vec<u8>
846    where
847        C: AsRef<[u8]>,
848    {
849        let mut res = Vec::new();
850        res.reserve(chunk.as_ref().len());
851        for ch in chunk.as_ref().iter() {
852            if let Some(link) = self.owner.links.get(self.state, *ch) {
853                if let Some(emitted) = self.owner.links.emitted(link) {
854                    res.extend(emitted);
855                }
856                self.state = self.owner.links.target(link);
857            } else {
858                if let Some(emitted) = self.owner.default_links.get(self.state) {
859                    res.extend(emitted);
860                }
861                res.push(*ch);
862                self.state = 0;
863            }
864        }
865        res
866    }
867
868    /// Finish streaming and flush buffers
869    ///
870    /// This indicates that there is no further data to come, so any
871    /// potential partial matches that have led to buffering can be
872    /// resolved, and output produced for them.
873    pub fn finish(self) -> Vec<u8> {
874        let mut res = Vec::new();
875        if let Some(emitted) = self.owner.default_links.get(self.state) {
876            res.extend(emitted);
877        }
878        res
879    }
880}
881
#[cfg(feature = "streams")]
mod streams {
    use super::{ChunkMasker, Masker};

    use bytes::Bytes;
    use core::task::Poll;
    use futures::Stream;

    /// A stream adaptor that masks the byte chunks of an inner stream.
    pub struct MaskedStream<'a, S, E>
    where
        S: Stream<Item = Result<Bytes, E>> + Unpin,
    {
        /// The wrapped stream of byte chunks.
        base: S,
        /// The chunk masker; taken out once the inner stream has ended.
        mask: Option<ChunkMasker<'a>>,
        /// Set once the inner stream has ended and the masker was flushed.
        completed: bool,
    }

    impl<'a, S, E> MaskedStream<'a, S, E>
    where
        S: Stream<Item = Result<Bytes, E>> + Unpin,
    {
        /// Wrap `base`, masking its content with `masker`.
        pub fn new(base: S, masker: &'a Masker) -> Self {
            let mask = Some(masker.mask_chunks());
            Self {
                base,
                mask,
                completed: false,
            }
        }
    }

    impl<'a, S, E> Stream for MaskedStream<'a, S, E>
    where
        S: Stream<Item = Result<Bytes, E>> + Unpin,
    {
        type Item = Result<Bytes, E>;

        /// Poll the inner stream until there is masked output to hand
        /// over. Chunks whose masked output is empty are skipped, errors
        /// are passed through unchanged, and the masker is flushed once
        /// when the inner stream ends.
        fn poll_next(
            self: core::pin::Pin<&mut Self>,
            cx: &mut core::task::Context<'_>,
        ) -> Poll<Option<Self::Item>> {
            let this = self.get_mut();

            while !this.completed {
                match core::pin::Pin::new(&mut this.base).poll_next(cx) {
                    Poll::Pending => return Poll::Pending,
                    Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))),
                    Poll::Ready(Some(Ok(bytes))) => {
                        let masked = this.mask.as_mut().unwrap().mask_chunk(bytes);
                        if !masked.is_empty() {
                            return Poll::Ready(Some(Ok(masked.into())));
                        }
                    }
                    Poll::Ready(None) => {
                        this.completed = true;
                        let tail = this.mask.take().unwrap().finish();
                        if !tail.is_empty() {
                            return Poll::Ready(Some(Ok(tail.into())));
                        }
                    }
                }
            }

            Poll::Ready(None)
        }
    }
}
954
955#[cfg(test)]
956mod test {
957    use super::{Masker, MatchData};
958    use rand::rngs::StdRng;
959    use rand::{Rng, SeedableRng};
960
961    fn slow_union(input: &[(usize, usize)]) -> Vec<(usize, usize)> {
962        let mut buf1 = Vec::from(input);
963        let mut buf2 = Vec::new();
964        let mut changes = true;
965        while changes {
966            changes = false;
967            for i in 0..buf1.len() {
968                for j in (i + 1)..buf1.len() {
969                    let x1 = std::cmp::max(buf1[i].0, buf1[j].0);
970                    let x2 = std::cmp::min(buf1[i].1, buf1[j].1);
971                    if x1 < x2 {
972                        // overlap
973                        for b in buf1.iter().take(i) {
974                            buf2.push(*b);
975                        }
976                        buf2.push((
977                            std::cmp::min(buf1[i].0, buf1[j].0),
978                            std::cmp::max(buf1[i].1, buf1[j].1),
979                        ));
980                        for b in buf1.iter().take(j).skip(i + 1) {
981                            buf2.push(*b);
982                        }
983                        for b in buf1.iter().skip(j + 1) {
984                            buf2.push(*b);
985                        }
986                        std::mem::swap(&mut buf1, &mut buf2);
987                        buf2.clear();
988                        changes = true;
989                        break;
990                    }
991                }
992                if changes {
993                    break;
994                }
995            }
996        }
997        buf1.sort_by(|a, b| a.0.cmp(&b.0));
998        buf1
999    }
1000
1001    #[test]
1002    fn test_union() {
1003        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1004        for _ in 0..2000000 {
1005            let mut spans = Vec::new();
1006            let count = rng.gen_range(1..20);
1007            for _ in 0..count {
1008                let x1 = rng.gen_range(0..50);
1009                let x2 = x1 + rng.gen_range(1..20);
1010                spans.push((x1, x2));
1011            }
1012            let mut value = super::unify_spans(&spans);
1013            let mut check = slow_union(&spans);
1014            value.sort();
1015            check.sort();
1016            assert_eq!(value, check);
1017        }
1018    }
1019
1020    fn mask_string_check<S: AsRef<str>>(string: &str, mask: &str, keys: &[S]) -> String {
1021        let spans = {
1022            let mut spans = Vec::new();
1023            for key in keys.iter() {
1024                let mut offset = 0usize;
1025                while let Some(ix) = string[offset..].find(key.as_ref()) {
1026                    let len = key.as_ref().as_bytes().len();
1027                    spans.push((offset + ix, offset + ix + len));
1028                    offset += ix + 1;
1029                }
1030            }
1031            spans
1032        };
1033
1034        let mut unioned_spans = super::unify_spans(&spans);
1035        unioned_spans.sort();
1036
1037        let mut offset = 0usize;
1038        let mut res = Vec::new();
1039        for span in unioned_spans {
1040            if offset < span.0 {
1041                res.extend_from_slice(&string.as_bytes()[offset..span.0]);
1042            }
1043            res.extend_from_slice(mask.as_bytes());
1044            offset = span.1;
1045        }
1046        if offset < string.as_bytes().len() {
1047            res.extend_from_slice(&string.as_bytes()[offset..]);
1048        }
1049        String::from_utf8_lossy(&res).into()
1050    }
1051
1052    fn random_string<R: Rng>(mut rng: R, len: usize) -> String {
1053        let mut res = String::new();
1054        for _ in 0..len {
1055            let ch = rng.gen_range('a'..'e');
1056            res.push(ch);
1057        }
1058        res
1059    }
1060
1061    fn random_buffer<R: Rng>(mut rng: R, len: usize) -> Vec<u8> {
1062        let mut res = Vec::new();
1063        res.resize(len, 0);
1064        for ch in res.iter_mut() {
1065            *ch = rng.gen_range(0x61..0x7a);
1066        }
1067        rng.fill_bytes(res.as_mut());
1068        res
1069    }
1070
1071    fn random_input<R: Rng>(mut rng: R, keys: &Vec<String>, len: usize) -> String {
1072        let mut res = String::new();
1073        // Because we require the chunks not contain any keys, and we
1074        // have a limited alphabet, making max_chunk large makes the
1075        // tests run very slowly, and in some cases can essentially
1076        // stall.
1077        let max_chunk = std::cmp::min(5, (len / 4) + 1);
1078        let mut stage = 0;
1079        assert!(max_chunk > 0);
1080        while res.len() < len {
1081            if stage == 0 {
1082                let len = rng.gen_range(1..(max_chunk + 1));
1083                if len > 0 {
1084                    let mut remaining = 1000;
1085                    let chunk = loop {
1086                        let chunk = random_string(&mut rng, len);
1087                        if !keys.iter().any(|k| chunk.contains(k)) {
1088                            break chunk;
1089                        }
1090                        remaining -= 1;
1091                        if remaining == 0 {
1092                            break String::new();
1093                        }
1094                    };
1095                    res.push_str(&chunk);
1096                }
1097            } else if !keys.is_empty() {
1098                let key = rng.gen_range(0..keys.len());
1099                res.push_str(&keys[key]);
1100            }
1101            stage = 1 - stage;
1102        }
1103        res
1104    }
1105
1106    #[test]
1107    fn test_masker() {
1108        let m = Masker::new(&["abcd", "1ab", "cde", "bce", "aa"], "-MASKED-");
1109        assert_eq!(m.mask_str("1abcdef"), "-MASKED-f".to_string());
1110        assert_eq!(m.mask_str("1a"), "1a".to_string());
1111        assert_eq!(m.mask_str("qqcdeblah"), "qq-MASKED-blah");
1112    }
1113
1114    #[test]
1115    fn test_masker_random() {
1116        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1117        for _ in 0..2000 {
1118            let num_keys = rng.gen_range(0..5);
1119            let mut keys = Vec::new();
1120            for _ in 0..num_keys {
1121                let len = rng.gen_range(1..6);
1122                keys.push(random_string(&mut rng, len));
1123            }
1124
1125            let m = Masker::new(&keys, "X");
1126
1127            for _ in 0..1000 {
1128                let len = rng.gen_range(0..100);
1129                let input = random_input(&mut rng, &keys, len);
1130                let output_as_string = m.mask_str(&input);
1131                let check = mask_string_check(&input, "X", &keys);
1132                for key in keys.iter() {
1133                    assert!(
1134                        !output_as_string.contains(key),
1135                        "Key {} is contained in output {}",
1136                        key,
1137                        output_as_string
1138                    );
1139                }
1140                assert_eq!(output_as_string, check);
1141            }
1142        }
1143    }
1144
1145    fn slice_contains_slice(haystack: &[u8], needle: &[u8]) -> bool {
1146        haystack
1147            .windows(needle.len())
1148            .any(|window| window == needle)
1149    }
1150
1151    fn add_separate_keys<R: Rng, S: AsRef<[u8]>>(
1152        mut rng: R,
1153        keys: &[S],
1154        buf: &mut Vec<u8>,
1155        gap: usize,
1156    ) -> usize {
1157        let mut offset = 0;
1158        let mut keys_added = 0;
1159        loop {
1160            let step = rng.gen_range((gap / 2)..gap);
1161            let key = &keys[rng.gen_range(0..keys.len())];
1162            offset += step;
1163            if offset >= buf.len() {
1164                break;
1165            }
1166            let end = std::cmp::min(buf.len(), offset + key.as_ref().len());
1167            let len = end - offset;
1168            buf[offset..(len + offset)].copy_from_slice(&key.as_ref()[..len]);
1169            keys_added += 1;
1170            offset += len;
1171        }
1172        keys_added
1173    }
1174
1175    fn add_random_keys<R: Rng, S: AsRef<[u8]>>(
1176        mut rng: R,
1177        keys: &[S],
1178        buf: &mut Vec<u8>,
1179        count: usize,
1180    ) -> usize {
1181        for _ in 0..count {
1182            let key = &keys[rng.gen_range(0..keys.len())];
1183            let offset = rng.gen_range(0..buf.len());
1184            let end = std::cmp::min(buf.len(), offset + key.as_ref().len());
1185            let len = end - offset;
1186            buf[offset..(len + offset)].copy_from_slice(&key.as_ref()[..len]);
1187        }
1188        count
1189    }
1190
1191    #[allow(dead_code)]
1192    fn diff_buffers<A: AsRef<[u8]>, B: AsRef<[u8]>>(a: A, b: B) -> bool {
1193        let len = std::cmp::min(a.as_ref().len(), b.as_ref().len());
1194        let mut offset = None;
1195        for i in 0..len {
1196            if a.as_ref()[i] != b.as_ref()[i] {
1197                offset = Some(i);
1198                break;
1199            }
1200        }
1201        if let Some(offset) = offset {
1202            println!("A   B   {}", offset);
1203            let start = if offset < 100 { 0 } else { offset - 100 };
1204            let end = if offset + 100 > len {
1205                len
1206            } else {
1207                offset + 100
1208            };
1209            for i in start..end {
1210                println!(
1211                    "{:03} {:03}{}",
1212                    a.as_ref()[i],
1213                    b.as_ref()[i],
1214                    if i == offset { " *" } else { "" }
1215                );
1216            }
1217            return false;
1218        } else if a.as_ref().len() > b.as_ref().len() {
1219            println!("A   B   {}", b.as_ref().len());
1220            for i in b.as_ref().len()..a.as_ref().len() {
1221                println!("{:03} ---", a.as_ref()[i]);
1222            }
1223            return false;
1224        } else if a.as_ref().len() < b.as_ref().len() {
1225            println!("A   B   {}", a.as_ref().len());
1226            for i in a.as_ref().len()..b.as_ref().len() {
1227                println!("--- {:03}", b.as_ref()[i]);
1228            }
1229            return false;
1230        }
1231        true
1232    }
1233
1234    fn mask_slice_check<S, T, U>(input: S, mask: T, keys: &[U]) -> Vec<u8>
1235    where
1236        S: AsRef<[u8]>,
1237        T: AsRef<[u8]>,
1238        U: AsRef<[u8]>,
1239    {
1240        let spans = {
1241            let mut spans = Vec::new();
1242            for key in keys.iter() {
1243                for ix in input
1244                    .as_ref()
1245                    .windows(key.as_ref().len())
1246                    .enumerate()
1247                    .filter(|(_, window)| window == &key.as_ref())
1248                    .map(|(index, _)| index)
1249                {
1250                    let len = key.as_ref().len();
1251                    spans.push((ix, ix + len));
1252                }
1253            }
1254            spans
1255        };
1256
1257        let mut unioned_spans = super::unify_spans(&spans);
1258        unioned_spans.sort();
1259
1260        let mut offset = 0usize;
1261        let mut res = Vec::new();
1262        for span in unioned_spans {
1263            if offset < span.0 {
1264                res.extend_from_slice(&input.as_ref()[offset..span.0]);
1265            }
1266            res.extend_from_slice(mask.as_ref());
1267            offset = span.1;
1268        }
1269        if offset < input.as_ref().len() {
1270            res.extend_from_slice(&input.as_ref()[offset..]);
1271        }
1272        res
1273    }
1274
1275    #[test]
1276    fn test_masker_slabs() {
1277        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1278        for input_type in 0..4 {
1279            for _ in 0..2 {
1280                let num_keys = rng.gen_range(1..15);
1281                let mut keys = Vec::new();
1282                for _ in 0..num_keys {
1283                    let len = rng.gen_range(10..50);
1284                    keys.push(random_buffer(&mut rng, len));
1285                }
1286
1287                let m = Masker::new(&keys, "XXXX-XXXX-XXXX-XXXX");
1288
1289                for _ in 0..3 {
1290                    let len = rng.gen_range(5_000_000..100_000_000);
1291                    let mut input = random_buffer(&mut rng, len);
1292                    match input_type {
1293                        0 => 0,
1294                        1 => add_random_keys(&mut rng, &keys, &mut input, 5),
1295                        2 => add_random_keys(&mut rng, &keys, &mut input, 20),
1296                        3 => add_separate_keys(&mut rng, &keys, &mut input, 20000),
1297                        _ => unreachable!(),
1298                    };
1299                    let output = m.mask_slice(&input);
1300                    let check = mask_slice_check(&input, "XXXX-XXXX-XXXX-XXXX", &keys);
1301                    for key in keys.iter() {
1302                        assert!(
1303                            !slice_contains_slice(&output, key),
1304                            "Key {:?} is contained in output",
1305                            key
1306                        );
1307                    }
1308                    for key in keys.iter() {
1309                        assert!(
1310                            !slice_contains_slice(&check, key),
1311                            "Key {:?} is contained in check",
1312                            key
1313                        );
1314                    }
1315                    diff_buffers(&output, &check);
1316                    assert_eq!(output, check);
1317                }
1318            }
1319        }
1320    }
1321
1322    #[test]
1323    fn test_chunk_masker_sanity() {
1324        let m = Masker::new(&["abcd", "1ab", "cde", "bce", "aa"], "-MASK-");
1325        let mut cm = m.mask_chunks();
1326        assert_eq!(cm.mask_chunk("ab"), Vec::new());
1327        assert_eq!(cm.mask_chunk("c"), Vec::new());
1328        assert_eq!(cm.mask_chunk("d"), Vec::new());
1329        assert_eq!(cm.mask_chunk("g"), Vec::from("-MASK-g".as_bytes()));
1330        assert_eq!(cm.finish().as_slice(), "".as_bytes())
1331    }
1332
1333    #[test]
1334    fn test_chunk_masker_random_no_prefixes() {
1335        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1336        for _ in 0..2000 {
1337            let num_keys = rng.gen_range(1..=5);
1338            let mut keys = Vec::new();
1339            for _ in 0..num_keys {
1340                let len = rng.gen_range(1..6);
1341                keys.push(random_string(&mut rng, len));
1342            }
1343
1344            let m = Masker::new(&keys, "X");
1345
1346            for _ in 0..1000 {
1347                let len = rng.gen_range(0..100);
1348                let input = random_input(&mut rng, &keys, len);
1349                let mut cm = m.mask_chunks();
1350                let mut output = Vec::new();
1351                let mut offset = 0;
1352                while offset < input.len() {
1353                    let chunk_len = rng.gen_range(0..(std::cmp::min(10, input.len() - offset + 1)));
1354                    let mut chunk = Vec::new();
1355                    for _ in 0..chunk_len {
1356                        chunk.push(input.as_bytes()[offset]);
1357                        offset += 1;
1358                    }
1359                    output.extend_from_slice(cm.mask_chunk(chunk).as_ref());
1360                }
1361                output.extend(cm.finish().as_slice());
1362                let output_as_string = String::from_utf8_lossy(&output);
1363                let check = mask_string_check(&input, "X", &keys);
1364                for key in keys.iter() {
1365                    assert!(
1366                        !output_as_string.contains(key),
1367                        "Key {} is contained in output {}",
1368                        key,
1369                        output_as_string
1370                    );
1371                }
1372                assert_eq!(output_as_string, check);
1373            }
1374        }
1375    }
1376
1377    #[test]
1378    fn test_chunk_masker_slabs() {
1379        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1380        for input_type in 0..4 {
1381            for _ in 0..2 {
1382                let num_keys = rng.gen_range(1..15);
1383                let mut keys = Vec::new();
1384                for _ in 0..num_keys {
1385                    let len = rng.gen_range(10..50);
1386                    keys.push(random_buffer(&mut rng, len));
1387                }
1388
1389                let m = Masker::new(&keys, "XXXX-XXXX-XXXX-XXXX");
1390
1391                for _ in 0..3 {
1392                    let len = rng.gen_range(5_000_000..100_000_000);
1393                    let mut input = random_buffer(&mut rng, len);
1394                    match input_type {
1395                        0 => 0,
1396                        1 => add_random_keys(&mut rng, &keys, &mut input, 5),
1397                        2 => add_random_keys(&mut rng, &keys, &mut input, 20),
1398                        3 => add_separate_keys(&mut rng, &keys, &mut input, 20000),
1399                        _ => unreachable!(),
1400                    };
1401                    let mut cm = m.mask_chunks();
1402                    let mut output = Vec::new();
1403                    let mut offset = 0;
1404                    while offset < input.len() {
1405                        let chunk_len =
1406                            rng.gen_range(0..(std::cmp::min(10, input.len() - offset + 1)));
1407                        let mut chunk = Vec::new();
1408                        for _ in 0..chunk_len {
1409                            chunk.push(input[offset]);
1410                            offset += 1;
1411                        }
1412                        output.extend_from_slice(cm.mask_chunk(chunk).as_ref());
1413                    }
1414                    output.extend(cm.finish().as_slice());
1415
1416                    let check = mask_slice_check(&input, "XXXX-XXXX-XXXX-XXXX", &keys);
1417                    for key in keys.iter() {
1418                        assert!(
1419                            !slice_contains_slice(&output, key),
1420                            "Key {:?} is contained in output {:?}",
1421                            key,
1422                            output
1423                        );
1424                    }
1425                    diff_buffers(&output, &check);
1426                    assert_eq!(output, check);
1427                }
1428            }
1429        }
1430    }
1431
1432    fn mask_string_check_with_prefixes<S: AsRef<str>>(
1433        string: &str,
1434        mask: &str,
1435        keys: &[S],
1436        pfxes: &[MatchData],
1437    ) -> String {
1438        let key_spans = {
1439            let mut spans = Vec::new();
1440            for key in keys.iter() {
1441                let mut offset = 0usize;
1442                while let Some(ix) = string[offset..].find(key.as_ref()) {
1443                    let len = key.as_ref().as_bytes().len();
1444                    spans.push((offset + ix, offset + ix + len));
1445                    offset += ix + 1;
1446                }
1447            }
1448            spans
1449        };
1450
1451        let pfx_spans = {
1452            let mut spans = Vec::new();
1453            for pfx in pfxes.iter() {
1454                if !pfx.prefix.is_empty() {
1455                    let mut offset = 0usize;
1456                    while let Some(ix) =
1457                        string[offset..].find(String::from_utf8_lossy(pfx.prefix).as_ref())
1458                    {
1459                        let start_ix = if pfx.mask_prefix {
1460                            offset + ix
1461                        } else {
1462                            offset + ix + pfx.prefix.len()
1463                        };
1464                        let mut end_ix = offset + ix + pfx.prefix.len();
1465                        while end_ix < string.len()
1466                            && pfx.suffix.contains(&string.as_bytes()[end_ix])
1467                        {
1468                            end_ix += 1;
1469                        }
1470                        if end_ix > start_ix {
1471                            spans.push((start_ix, end_ix));
1472                        }
1473                        offset += ix + 1;
1474                        if offset >= string.as_bytes().len() {
1475                            break;
1476                        }
1477                    }
1478                }
1479            }
1480            spans
1481        };
1482
1483        let mut spans = key_spans;
1484        spans.extend(pfx_spans);
1485
1486        let mut unioned_spans = super::unify_spans(&spans);
1487        unioned_spans.sort();
1488
1489        let mut offset = 0usize;
1490        let mut res = Vec::new();
1491        for span in unioned_spans {
1492            if offset < span.0 {
1493                res.extend_from_slice(&string.as_bytes()[offset..span.0]);
1494            }
1495            res.extend_from_slice(mask.as_bytes());
1496            offset = span.1;
1497        }
1498        if offset < string.as_bytes().len() {
1499            res.extend_from_slice(&string.as_bytes()[offset..]);
1500        }
1501
1502        for pfx in pfxes.iter() {
1503            if pfx.prefix.is_empty() {
1504                let mut buf = Vec::new();
1505                for b in res.iter() {
1506                    if pfx.suffix.contains(b) {
1507                        buf.extend_from_slice(mask.as_bytes());
1508                    } else {
1509                        buf.push(*b);
1510                    }
1511                }
1512                std::mem::swap(&mut res, &mut buf);
1513            }
1514        }
1515
1516        String::from_utf8_lossy(&res).into()
1517    }
1518
1519    fn add_separate_keys_with_prefixes<R: Rng, S: AsRef<[u8]>>(
1520        mut rng: R,
1521        keys: &[S],
1522        pfxes: &[MatchData],
1523        buf: &mut Vec<u8>,
1524        gap: usize,
1525    ) -> usize {
1526        let mut offset = 0;
1527        let mut keys_added = 0;
1528        loop {
1529            let step = rng.gen_range((gap / 2)..gap);
1530            let key_ix = rng.gen_range(0..keys.len() + pfxes.len());
1531            offset += step;
1532            if offset >= buf.len() {
1533                break;
1534            }
1535            if key_ix < keys.len() {
1536                let key = &keys[key_ix];
1537                let end = std::cmp::min(buf.len(), offset + key.as_ref().len());
1538                let len = end - offset;
1539                buf[offset..(len + offset)].copy_from_slice(&key.as_ref()[..len]);
1540                offset += len;
1541            } else {
1542                let pfx = &pfxes[key_ix - keys.len()];
1543                let pfx_end = std::cmp::min(buf.len(), offset + pfx.prefix.len());
1544                let pfx_len = pfx_end - offset;
1545                buf[offset..(pfx_len + offset)].copy_from_slice(&pfx.prefix[..pfx_len]);
1546                offset += pfx_len;
1547                let suffix_len = if !pfx.suffix.is_empty() && offset < buf.len() {
1548                    rng.gen_range(0..std::cmp::min(64, buf.len() - offset))
1549                } else {
1550                    0
1551                };
1552                for _ in 0..suffix_len {
1553                    buf[offset] = pfx.suffix[rng.gen_range(0..pfx.suffix.len())];
1554                    offset += 1;
1555                }
1556            }
1557            keys_added += 1;
1558        }
1559        keys_added
1560    }
1561
1562    fn add_random_keys_with_prefixes<R: Rng, S: AsRef<[u8]>>(
1563        mut rng: R,
1564        keys: &[S],
1565        pfxes: &[MatchData],
1566        buf: &mut Vec<u8>,
1567        count: usize,
1568    ) -> usize {
1569        for _ in 0..count {
1570            let ix = rng.gen_range(0..(keys.len() + pfxes.len()));
1571            let offset = rng.gen_range(0..buf.len());
1572            if ix < keys.len() {
1573                let key = &keys[ix];
1574                let end = std::cmp::min(buf.len(), offset + key.as_ref().len());
1575                let len = end - offset;
1576                buf[offset..(len + offset)].copy_from_slice(&key.as_ref()[..len]);
1577            } else {
1578                let pfx = &pfxes[ix - keys.len()];
1579                let pfx_end = std::cmp::min(buf.len(), offset + pfx.prefix.len());
1580                let pfx_len = pfx_end - offset;
1581                buf[offset..(pfx_len + offset)].copy_from_slice(&pfx.prefix[..pfx_len]);
1582                let suffix_len = if !pfx.suffix.is_empty() && offset < buf.len() {
1583                    rng.gen_range(0..std::cmp::min(64, buf.len() - offset))
1584                } else {
1585                    0
1586                };
1587                for i in 0..suffix_len {
1588                    buf[offset + pfx_len + i] = pfx.suffix[rng.gen_range(0..pfx.suffix.len())];
1589                }
1590            }
1591        }
1592        count
1593    }
1594
1595    #[test]
1596    fn test_masker_with_prefixes() {
1597        let p = MatchData {
1598            prefix: "pfx-".as_ref(),
1599            suffix: "abcde".as_ref(),
1600            mask_prefix: false,
1601        };
1602        let inputs: &[&str] = &[];
1603        let m = Masker::new_with_match_data(inputs, &[p], "-MASKED-");
1604        assert_eq!(
1605            m.mask_str("pfx-aeebfcsfasgs"),
1606            "pfx--MASKED-fcsfasgs".to_string()
1607        );
1608
1609        let p = MatchData {
1610            prefix: "pfx-".as_ref(),
1611            suffix: "abcde".as_ref(),
1612            mask_prefix: true,
1613        };
1614        let inputs: &[&str] = &[];
1615        let m = Masker::new_with_match_data(inputs, &[p], "-MASKED-");
1616        assert_eq!(
1617            m.mask_str("pfx-aeebfcsfasgs"),
1618            "-MASKED-fcsfasgs".to_string()
1619        );
1620    }
1621
1622    #[test]
1623    fn test_masker_with_prefixes_random() {
1624        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1625        for _ in 0..2000 {
1626            let num_keys = rng.gen_range(0..5);
1627            let mut keys = Vec::new();
1628            for _ in 0..num_keys {
1629                let len = rng.gen_range(1..6);
1630                keys.push(random_string(&mut rng, len));
1631            }
1632            let num_prefixes = rng.gen_range(0..3);
1633            let mut prefixes = Vec::new();
1634            let mut suffixes = Vec::new();
1635            for _ in 0..num_prefixes {
1636                let prefix_len = rng.gen_range(0..6);
1637                let pfx = random_string(&mut rng, prefix_len);
1638                let suffix_set = rng.gen_range(0..4);
1639                let suf = {
1640                    let mut s = String::new();
1641                    while s.len() < suffix_set {
1642                        let ch = rng.gen_range('a'..'e');
1643                        if !s.contains(ch) {
1644                            s.push(ch);
1645                        }
1646                    }
1647                    s
1648                };
1649                prefixes.push(pfx);
1650                suffixes.push(suf);
1651            }
1652            let mask_prefix = rng.gen_bool(0.5);
1653
1654            let mut pfxes = Vec::new();
1655            for i in 0..num_prefixes {
1656                pfxes.push(MatchData {
1657                    prefix: prefixes[i].as_ref(),
1658                    suffix: suffixes[i].as_ref(),
1659                    mask_prefix,
1660                });
1661            }
1662
1663            let m = Masker::new_with_match_data(&keys, &pfxes, "X");
1664
1665            for _ in 0..1000 {
1666                let len = rng.gen_range(0..100);
1667                let input = random_input(&mut rng, &keys, len);
1668                let output_as_string = m.mask_str(&input);
1669                let check = mask_string_check_with_prefixes(&input, "X", &keys, &pfxes);
1670                for key in keys.iter() {
1671                    assert!(
1672                        !output_as_string.contains(key),
1673                        "Key {} is contained in output {}",
1674                        key,
1675                        output_as_string
1676                    );
1677                }
1678                for pfx in pfxes.iter() {
1679                    if !pfx.prefix.is_empty() {
1680                        if pfx.mask_prefix {
1681                            assert!(
1682                                !output_as_string.contains(&*String::from_utf8_lossy(pfx.prefix)),
1683                                "Prefix {} is contained in output {}",
1684                                String::from_utf8_lossy(pfx.prefix),
1685                                output_as_string
1686                            );
1687                        } else {
1688                            for ix in output_as_string
1689                                .as_bytes()
1690                                .windows(pfx.prefix.len())
1691                                .enumerate()
1692                                .filter(|(_, w)| w == &pfx.prefix)
1693                                .map(|(ix, _)| ix)
1694                            {
1695                                if ix + pfx.prefix.len() == output_as_string.as_bytes().len() {
1696                                    break;
1697                                }
1698                                assert!(
1699                                    !pfx.suffix.contains(
1700                                        &output_as_string.as_bytes()[ix + pfx.prefix.len()]
1701                                    ),
1702                                    "Suffix char {} is present after prefix {} at offset {} in {}",
1703                                    char::from_u32(
1704                                        output_as_string.as_bytes()[ix + pfx.prefix.len()] as u32
1705                                    )
1706                                    .unwrap(),
1707                                    String::from_utf8_lossy(pfx.prefix),
1708                                    ix,
1709                                    output_as_string
1710                                );
1711                            }
1712                        }
1713                    } else {
1714                        for ch in pfx.suffix.iter() {
1715                            assert!(
1716                                !output_as_string.as_bytes().contains(ch),
1717                                "Suffix char {} of suffix {} is contained in output",
1718                                ch,
1719                                String::from_utf8_lossy(pfx.suffix)
1720                            );
1721                        }
1722                    }
1723                }
1724
1725                assert_eq!(output_as_string, check);
1726            }
1727        }
1728    }
1729
1730    #[test]
1731    fn test_chunk_masker_with_prefixes() {
1732        let p = MatchData {
1733            prefix: "pfx-".as_ref(),
1734            suffix: "abcde".as_ref(),
1735            mask_prefix: false,
1736        };
1737        let inputs: &[&str] = &[];
1738        let m = Masker::new_with_match_data(inputs, &[p], "-MASKED-");
1739        let mut cm = m.mask_chunks();
1740        assert_eq!(cm.mask_chunk("pf"), Vec::from("pf".as_bytes()));
1741        assert_eq!(cm.mask_chunk("x-a"), Vec::from("x-".as_bytes()));
1742        assert_eq!(cm.mask_chunk("eebfcs"), Vec::from("-MASKED-fcs"));
1743        assert_eq!(cm.mask_chunk("fasgs"), Vec::from("fasgs".as_bytes()));
1744        assert_eq!(cm.finish().as_slice(), "".as_bytes());
1745
1746        let p = MatchData {
1747            prefix: "pfx-".as_ref(),
1748            suffix: "abcde".as_ref(),
1749            mask_prefix: true,
1750        };
1751        let inputs: &[&str] = &[];
1752        let m = Masker::new_with_match_data(inputs, &[p], "-MASKED-");
1753        let mut cm = m.mask_chunks();
1754        assert_eq!(cm.mask_chunk("pf"), Vec::new());
1755        assert_eq!(cm.mask_chunk("x-a"), Vec::new());
1756        assert_eq!(cm.mask_chunk("eebfcs"), Vec::from("-MASKED-fcs"));
1757        assert_eq!(cm.mask_chunk("fasgs"), Vec::from("fasgs".as_bytes()));
1758        assert_eq!(cm.finish().as_slice(), "".as_bytes());
1759    }
1760
1761    #[test]
1762    fn test_chunk_masker_random_with_prefixes() {
1763        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1764        for _ in 0..2000 {
1765            let num_keys = rng.gen_range(1..=5);
1766            let mut keys = Vec::new();
1767            for _ in 0..num_keys {
1768                let len = rng.gen_range(1..6);
1769                keys.push(random_string(&mut rng, len));
1770            }
1771            let num_prefixes = rng.gen_range(0..3);
1772            let mut prefixes = Vec::new();
1773            let mut suffixes = Vec::new();
1774            for _ in 0..num_prefixes {
1775                let prefix_len = rng.gen_range(0..6);
1776                let pfx = random_string(&mut rng, prefix_len);
1777                let suffix_set = rng.gen_range(0..4);
1778                let suf = {
1779                    let mut s = String::new();
1780                    while s.len() < suffix_set {
1781                        let ch = rng.gen_range('a'..'e');
1782                        if !s.contains(ch) {
1783                            s.push(ch);
1784                        }
1785                    }
1786                    s
1787                };
1788                prefixes.push(pfx);
1789                suffixes.push(suf);
1790            }
1791            let mask_prefix = rng.gen_bool(0.5);
1792
1793            let mut pfxes = Vec::new();
1794            for i in 0..num_prefixes {
1795                pfxes.push(MatchData {
1796                    prefix: prefixes[i].as_ref(),
1797                    suffix: suffixes[i].as_ref(),
1798                    mask_prefix,
1799                });
1800            }
1801
1802            let m = Masker::new_with_match_data(&keys, &pfxes, "X");
1803
1804            for _ in 0..1000 {
1805                let len = rng.gen_range(0..100);
1806                let input = random_input(&mut rng, &keys, len);
1807                let mut cm = m.mask_chunks();
1808                let mut output = Vec::new();
1809                let mut offset = 0;
1810                while offset < input.len() {
1811                    let chunk_len = rng.gen_range(0..(std::cmp::min(10, input.len() - offset + 1)));
1812                    let mut chunk = Vec::new();
1813                    for _ in 0..chunk_len {
1814                        chunk.push(input.as_bytes()[offset]);
1815                        offset += 1;
1816                    }
1817                    output.extend_from_slice(cm.mask_chunk(chunk).as_ref());
1818                }
1819                output.extend(cm.finish().as_slice());
1820                let output_as_string = String::from_utf8_lossy(&output);
1821                let check = mask_string_check_with_prefixes(&input, "X", &keys, &pfxes);
1822                for key in keys.iter() {
1823                    assert!(
1824                        !output_as_string.contains(key),
1825                        "Key {} is contained in output {}",
1826                        key,
1827                        output_as_string
1828                    );
1829                }
1830                for pfx in pfxes.iter() {
1831                    if !pfx.prefix.is_empty() {
1832                        if pfx.mask_prefix {
1833                            assert!(
1834                                !output_as_string.contains(&*String::from_utf8_lossy(pfx.prefix)),
1835                                "Prefix {} is contained in output {}",
1836                                String::from_utf8_lossy(pfx.prefix),
1837                                output_as_string
1838                            );
1839                        } else {
1840                            for ix in output_as_string
1841                                .as_bytes()
1842                                .windows(pfx.prefix.len())
1843                                .enumerate()
1844                                .filter(|(_, w)| w == &pfx.prefix)
1845                                .map(|(ix, _)| ix)
1846                            {
1847                                if ix + pfx.prefix.len() == output_as_string.as_bytes().len() {
1848                                    break;
1849                                }
1850                                assert!(
1851                                    !pfx.suffix.contains(
1852                                        &output_as_string.as_bytes()[ix + pfx.prefix.len()]
1853                                    ),
1854                                    "Suffix char {} is present after prefix {} at offset {} in {}",
1855                                    char::from_u32(
1856                                        output_as_string.as_bytes()[ix + pfx.prefix.len()] as u32
1857                                    )
1858                                    .unwrap(),
1859                                    String::from_utf8_lossy(pfx.prefix),
1860                                    ix,
1861                                    output_as_string
1862                                );
1863                            }
1864                        }
1865                    } else {
1866                        for ch in pfx.suffix.iter() {
1867                            assert!(
1868                                !output_as_string.as_bytes().contains(ch),
1869                                "Suffix char {} of suffix {} is contained in output",
1870                                ch,
1871                                String::from_utf8_lossy(pfx.suffix)
1872                            );
1873                        }
1874                    }
1875                }
1876
1877                assert_eq!(output_as_string, check);
1878            }
1879        }
1880    }
1881
1882    fn mask_slice_check_with_prefixes<S, T, U>(
1883        input: S,
1884        mask: T,
1885        keys: &[U],
1886        pfxes: &[MatchData],
1887    ) -> Vec<u8>
1888    where
1889        S: AsRef<[u8]>,
1890        T: AsRef<[u8]>,
1891        U: AsRef<[u8]>,
1892    {
1893        let key_spans = {
1894            let mut spans = Vec::new();
1895            for key in keys.iter() {
1896                for ix in input
1897                    .as_ref()
1898                    .windows(key.as_ref().len())
1899                    .enumerate()
1900                    .filter(|(_, window)| window == &key.as_ref())
1901                    .map(|(index, _)| index)
1902                {
1903                    let len = key.as_ref().len();
1904                    spans.push((ix, ix + len));
1905                }
1906            }
1907            spans
1908        };
1909
1910        let pfx_spans = {
1911            let mut spans = Vec::new();
1912            for pfx in pfxes.iter() {
1913                if !pfx.prefix.is_empty() {
1914                    for ix in input
1915                        .as_ref()
1916                        .windows(pfx.prefix.len())
1917                        .enumerate()
1918                        .filter(|(_, window)| window == &pfx.prefix)
1919                        .map(|(index, _)| index)
1920                    {
1921                        let len = pfx.prefix.len();
1922                        let end_ix = ix
1923                            + input.as_ref()[ix + len..]
1924                                .iter()
1925                                .position(|ch| !pfx.suffix.contains(ch))
1926                                .map(|i| len + i)
1927                                .unwrap_or_else(|| input.as_ref().len() - ix);
1928                        if pfx.mask_prefix {
1929                            spans.push((ix, end_ix));
1930                        } else if end_ix > ix + len {
1931                            spans.push((ix + len, end_ix));
1932                        }
1933                    }
1934                }
1935            }
1936            spans
1937        };
1938
1939        let mut spans = key_spans;
1940        spans.extend(pfx_spans);
1941
1942        let mut unioned_spans = super::unify_spans(&spans);
1943        unioned_spans.sort();
1944
1945        let mut offset = 0usize;
1946        let mut res = Vec::new();
1947        for span in unioned_spans {
1948            if offset < span.0 {
1949                res.extend_from_slice(&input.as_ref()[offset..span.0]);
1950            }
1951            res.extend_from_slice(mask.as_ref());
1952            offset = span.1;
1953        }
1954        if offset < input.as_ref().len() {
1955            res.extend_from_slice(&input.as_ref()[offset..]);
1956        }
1957
1958        for pfx in pfxes.iter() {
1959            if pfx.prefix.is_empty() {
1960                let mut buf = Vec::new();
1961                for b in res.iter() {
1962                    if pfx.suffix.contains(b) {
1963                        buf.extend_from_slice(mask.as_ref());
1964                    } else {
1965                        buf.push(*b);
1966                    }
1967                }
1968                std::mem::swap(&mut res, &mut buf);
1969            }
1970        }
1971
1972        res
1973    }
1974
1975    #[test]
1976    fn test_chunk_masker_slabs_with_prefixes() {
1977        let mut rng = StdRng::seed_from_u64(0xdeadbeefabadcafe);
1978        for input_type in 0..4 {
1979            for _ in 0..2 {
1980                let num_keys = rng.gen_range(1..15);
1981                let mut keys = Vec::new();
1982                for _ in 0..num_keys {
1983                    let len = rng.gen_range(10..50);
1984                    keys.push(random_buffer(&mut rng, len));
1985                }
1986
1987                let num_prefixes = rng.gen_range(0..10);
1988                let mut prefixes = Vec::new();
1989                let mut suffixes = Vec::new();
1990                for _ in 0..num_prefixes {
1991                    let prefix_len = rng.gen_range(0..15);
1992                    let pfx = random_buffer(&mut rng, prefix_len);
1993                    let suffix_set = rng.gen_range(0..50);
1994                    let suf = {
1995                        let mut s = Vec::new();
1996                        while s.len() < suffix_set {
1997                            let ch = rng.gen_range(0..50);
1998                            if !s.contains(&ch) {
1999                                s.push(ch);
2000                            }
2001                        }
2002                        s.sort();
2003                        s
2004                    };
2005                    prefixes.push(pfx);
2006                    suffixes.push(suf);
2007                }
2008                let mask_prefix = rng.gen_bool(0.5);
2009
2010                let mut pfxes = Vec::new();
2011                for i in 0..num_prefixes {
2012                    pfxes.push(MatchData {
2013                        prefix: prefixes[i].as_ref(),
2014                        suffix: suffixes[i].as_ref(),
2015                        mask_prefix,
2016                    });
2017                }
2018
2019                let m = Masker::new_with_match_data(&keys, &pfxes, "ABCD=EFGH=IJKL=MNOP");
2020
2021                for _ in 0..3 {
2022                    let len = rng.gen_range(5_000_000..100_000_000);
2023                    let mut input = random_buffer(&mut rng, len);
2024                    match input_type {
2025                        0 => 0,
2026                        1 => add_random_keys_with_prefixes(&mut rng, &keys, &pfxes, &mut input, 5),
2027                        2 => add_random_keys_with_prefixes(&mut rng, &keys, &pfxes, &mut input, 20),
2028                        3 => add_separate_keys_with_prefixes(
2029                            &mut rng, &keys, &pfxes, &mut input, 20000,
2030                        ),
2031                        _ => unreachable!(),
2032                    };
2033                    let mut cm = m.mask_chunks();
2034                    let mut output = Vec::new();
2035                    let mut offset = 0;
2036                    while offset < input.len() {
2037                        let chunk_len =
2038                            rng.gen_range(0..(std::cmp::min(10, input.len() - offset + 1)));
2039                        let mut chunk = Vec::new();
2040                        for _ in 0..chunk_len {
2041                            chunk.push(input[offset]);
2042                            offset += 1;
2043                        }
2044                        output.extend_from_slice(cm.mask_chunk(chunk).as_ref());
2045                    }
2046                    output.extend(cm.finish().as_slice());
2047
2048                    let check = mask_slice_check_with_prefixes(
2049                        &input,
2050                        "ABCD=EFGH=IJKL=MNOP",
2051                        &keys,
2052                        &pfxes,
2053                    );
2054                    for key in keys.iter() {
2055                        assert!(
2056                            !slice_contains_slice(&output, key),
2057                            "Key {:?} is contained in output {:?}",
2058                            key,
2059                            output
2060                        );
2061                    }
2062                    diff_buffers(&output, &check);
2063                    assert_eq!(output, check);
2064                }
2065            }
2066        }
2067    }
2068
2069    #[cfg(feature = "streams")]
2070    mod streams {
2071        use bytes::Bytes;
2072        use core::convert::Infallible;
2073        use core::pin::Pin;
2074        use core::task::{Context, Poll};
2075        use futures::{Stream, StreamExt};
2076
2077        use crate::Masker;
2078
2079        struct StringStream<'a> {
2080            data: &'a str,
2081            offset: usize,
2082        }
2083
2084        impl<'a> StringStream<'a> {
2085            pub fn new(data: &'a str) -> Self {
2086                Self { data, offset: 0 }
2087            }
2088        }
2089
2090        impl<'a> Stream for StringStream<'a> {
2091            type Item = Result<Bytes, Infallible>;
2092
2093            fn poll_next(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
2094                let this = self.get_mut();
2095                if this.offset < this.data.len() {
2096                    let ch = &this.data.as_bytes()[this.offset..this.offset + 1];
2097                    this.offset += 1;
2098                    Poll::Ready(Some(Ok(Bytes::copy_from_slice(ch))))
2099                } else {
2100                    Poll::Ready(None)
2101                }
2102            }
2103        }
2104
2105        async fn aggregate<T, E>(mut s: T) -> Result<Bytes, E>
2106        where
2107            T: Stream<Item = Result<Bytes, E>> + Unpin,
2108            E: core::fmt::Debug,
2109        {
2110            let mut v = Vec::new();
2111            while let Some(r) = s.next().await {
2112                match r {
2113                    Ok(bytes) => v.extend_from_slice(&bytes),
2114                    Err(e) => {
2115                        return Err(e);
2116                    }
2117                }
2118            }
2119            Ok(v.into())
2120        }
2121
2122        #[tokio::test]
2123        async fn test_stream_sanity() {
2124            let m = Masker::new(&["abcd", "1ab", "cde", "bce", "aa"], "-MASKED-");
2125
2126            assert_eq!(
2127                aggregate(m.mask_stream(StringStream::new("1abcdef")))
2128                    .await
2129                    .unwrap(),
2130                "-MASKED-f"
2131            );
2132            assert_eq!(
2133                aggregate(m.mask_stream(StringStream::new("1a")))
2134                    .await
2135                    .unwrap(),
2136                "1a"
2137            );
2138            assert_eq!(
2139                aggregate(m.mask_stream(StringStream::new("qqcdeblah")))
2140                    .await
2141                    .unwrap(),
2142                "qq-MASKED-blah"
2143            );
2144        }
2145    }
2146}