// minify_html_onepass/proc/mod.rs

use crate::err::debug_repr;
use crate::err::Error;
use crate::err::ErrorType;
use crate::err::ProcessingResult;
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use aho_corasick::AhoCorasick;
use core::fmt;
use memchr::memchr;
use minify_html_common::gen::codepoints::Lookup;
use minify_html_common::spec::tag::EMPTY_SLICE;
use std::fmt::Debug;
use std::fmt::Formatter;
use std::ops::Index;
use std::ops::IndexMut;

pub mod checkpoint;
pub mod entity;
pub mod range;
#[allow(dead_code)]
/// How `Processor::m` matches against the upcoming source bytes.
///
/// Naming convention:
/// - `Is…` matches exactly zero or one byte (one if the condition holds).
/// - `While…` matches the longest prefix for which the condition holds
///   (possibly zero bytes).
/// - `Through…` matches up to AND including the terminator, or zero bytes
///   if the terminator is never found.
pub enum MatchMode {
  // Single-byte and byte-run matches.
  IsChar(u8),
  IsNotChar(u8),
  WhileChar(u8),
  WhileNotChar(u8),
  // Through is like WhileNot followed by Is, but matches zero if Is is zero.
  ThroughChar(u8),

  // Predicate-driven variants of the above.
  IsPred(fn(u8) -> bool),
  IsNotPred(fn(u8) -> bool),
  WhilePred(fn(u8) -> bool),
  WhileNotPred(fn(u8) -> bool),

  // Table-driven variants using a precomputed byte lookup table.
  IsInLookup(&'static Lookup),
  WhileInLookup(&'static Lookup),
  WhileNotInLookup(&'static Lookup),

  // Multi-byte sequence matches; the `While`/`Through` forms use a
  // prebuilt Aho-Corasick automaton to find the next occurrence.
  IsSeq(&'static [u8]),
  WhileNotSeq(&'static AhoCorasick),
  ThroughSeq(&'static AhoCorasick),
}
44
/// What `Processor::m` does with the bytes it matched.
pub enum MatchAction {
  // Copy the matched bytes to the output (advances both cursors).
  Keep,
  // Drop the matched bytes (advances only the read cursor).
  Discard,
  // Report the match but consume nothing (neither cursor moves).
  MatchOnly,
}
50
// Processing state of a file. Single use only; create one per processing.
//
// The processor minifies in place: `code` is simultaneously the source and
// the destination. Bytes are read at `read_next` and written at `write_next`;
// since minification only ever shrinks or keeps content,
// `write_next <= read_next` holds throughout.
pub struct Processor<'d> {
  code: &'d mut [u8],
  // Index of the next character to read.
  read_next: usize,
  // Index of the next unwritten space.
  write_next: usize,
}
59
60impl<'d> Index<ProcessorRange> for Processor<'d> {
61  type Output = [u8];
62
63  #[inline(always)]
64  fn index(&self, index: ProcessorRange) -> &Self::Output {
65    &self.code[index.start..index.end]
66  }
67}
68
69impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
70  #[inline(always)]
71  fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output {
72    debug_assert!(index.end <= self.write_next);
73    &mut self.code[index.start..index.end]
74  }
75}
76
#[allow(dead_code)]
impl<'d> Processor<'d> {
  // Constructor.
  /// Create a processor over `code`. Minification happens in place: bytes
  /// are consumed from `read_next` and compacted down to `write_next`.
  #[inline(always)]
  pub fn new(code: &mut [u8]) -> Processor {
    Processor {
      write_next: 0,
      read_next: 0,
      code,
    }
  }

  // INTERNAL APIs.
  // Bounds checking.
  /// True if the byte at `read_next + offset` exists.
  #[inline(always)]
  fn _in_bounds(&self, offset: usize) -> bool {
    self.read_next + offset < self.code.len()
  }

  // Reading.
  /// Get the `offset` character from next.
  /// When `offset` is 0, the next character is returned.
  /// Panics. Does not check bounds for performance (e.g. already checked).
  #[inline(always)]
  fn _read_offset(&self, offset: usize) -> u8 {
    self.code[self.read_next + offset]
  }

  /// Checked variant of `_read_offset`; `None` when out of bounds.
  #[inline(always)]
  fn _maybe_read_offset(&self, offset: usize) -> Option<u8> {
    self.code.get(self.read_next + offset).copied()
  }

  /// Borrow `count` bytes starting `offset` past the read cursor;
  /// `None` when the slice would run past the end of the buffer.
  #[inline(always)]
  fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
    self
      .code
      .get(self.read_next + offset..self.read_next + offset + count)
  }

  /// Move next `amount` characters to output.
  /// Panics. Does not check bounds for performance (e.g. already checked).
  #[inline(always)]
  fn _shift(&mut self, amount: usize) {
    // Optimisation: Don't shift if already there (but still update offsets).
    if self.read_next != self.write_next {
      self
        .code
        .copy_within(self.read_next..self.read_next + amount, self.write_next);
    };
    self.read_next += amount;
    self.write_next += amount;
  }

  /// Splice `data` over the output range `[start, end)`, sliding the bytes
  /// between `end` and `write_next` forward to make room.
  /// Returns how many bytes the output grew by.
  // NOTE(review): `data.len() - (end - start)` underflows when `data` is
  // shorter than the replaced span, so this appears to support only
  // same-length or growing replacements — TODO confirm against callers.
  #[inline(always)]
  fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
    debug_assert!(start <= end);
    let added = data.len() - (end - start);
    // Do not allow writing over source.
    debug_assert!(self.write_next + added <= self.read_next);
    self.code.copy_within(end..self.write_next, end + added);
    self.code[start..start + data.len()].copy_from_slice(data);
    // Don't need to update read_next as only data before it has changed.
    self.write_next += added;
    added
  }

  /// Insert `data` into the output at position `at` (zero-width `_replace`).
  #[inline(always)]
  fn _insert(&mut self, at: usize, data: &[u8]) -> usize {
    self._replace(at, at, data)
  }

  // Matching.
  /// Match at most one byte: 1 if the next byte exists and satisfies
  /// `cond`, else 0.
  #[inline(always)]
  fn _one<C: FnOnce(u8) -> bool>(&mut self, cond: C) -> usize {
    self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize
  }

  /// Match the longest run of upcoming bytes satisfying `cond`
  /// (possibly zero).
  #[inline(always)]
  fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
    let mut count = 0usize;
    while self
      ._maybe_read_offset(count)
      .filter(|c| cond(*c))
      .is_some()
    {
      count += 1;
    }
    count
  }

  /// Number of source bytes not yet consumed.
  #[inline(always)]
  fn _remaining(&self) -> usize {
    self.code.len() - self.read_next
  }

  /// Core matching primitive: compute how many upcoming bytes `mode`
  /// matches, then apply `action` (keep/discard/peek) to them.
  /// Returns the range of the match — in the written region for `Keep`,
  /// in the source region otherwise. The range is empty on no match.
  #[inline(always)]
  pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
    let count = match mode {
      IsChar(c) => self._one(|n| n == c),
      IsNotChar(c) => self._one(|n| n != c),
      WhileChar(c) => self._many(|n| n == c),
      // memchr fast path; no terminator means the rest of the source matches.
      WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
      // +1 to include the terminator itself; zero if it is never found.
      ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),

      IsInLookup(lookup) => self._one(|n| lookup[n]),
      WhileInLookup(lookup) => self._many(|n| lookup[n]),
      WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),

      IsPred(p) => self._one(p),
      IsNotPred(p) => self._one(|n| !p(n)),
      WhilePred(p) => self._many(p),
      WhileNotPred(p) => self._many(|n| !p(n)),

      // Whole-sequence prefix match: all of `seq` or nothing.
      IsSeq(seq) => self
        ._maybe_read_slice_offset(0, seq.len())
        .filter(|src| *src == seq)
        .map_or(0, |_| seq.len()),
      WhileNotSeq(seq) => seq
        .find(&self.code[self.read_next..])
        .map_or(self._remaining(), |m| m.start()),
      // Match.end is exclusive, so do not add one.
      ThroughSeq(seq) => seq
        .find(&self.code[self.read_next..])
        .map_or(0, |m| m.end()),
    };
    // If keeping, match will be available in written range (which is better as source might eventually get overwritten).
    // If discarding, then only option is source range.
    let start = match action {
      Discard | MatchOnly => self.read_next,
      Keep => self.write_next,
    };
    match action {
      Discard => self.read_next += count,
      Keep => self._shift(count),
      MatchOnly => {}
    };

    ProcessorRange {
      start,
      end: start + count,
    }
  }

  // PUBLIC APIs.
  // Bounds checking
  /// True once every source byte has been consumed.
  #[inline(always)]
  pub fn at_end(&self) -> bool {
    !self._in_bounds(0)
  }

  /// Borrow the bytes of `r`, or the empty slice when `r` is `None` or
  /// out of bounds.
  #[inline(always)]
  pub fn get_or_empty(&self, r: Option<ProcessorRange>) -> &[u8] {
    r.and_then(|r| self.code.get(r.start..r.end))
      .unwrap_or(EMPTY_SLICE)
  }

  /// Error with `UnexpectedEnd` if the source is exhausted.
  #[inline(always)]
  pub fn require_not_at_end(&self) -> ProcessingResult<()> {
    if self.at_end() {
      Err(ErrorType::UnexpectedEnd)
    } else {
      Ok(())
    }
  }

  /// Get how many characters have been consumed from source.
  #[inline(always)]
  pub fn read_len(&self) -> usize {
    self.read_next
  }

  /// Advance the write cursor by `amount` without writing anything.
  // NOTE(review): leaves `amount` bytes of stale data in the output; the
  // caller is presumably expected to fill them afterwards — verify usage.
  #[inline(always)]
  pub fn reserve_output(&mut self, amount: usize) {
    self.write_next += amount;
  }

  // Looking ahead.
  /// Get the `offset` character from next.
  /// When `offset` is 0, the next character is returned.
  #[inline(always)]
  pub fn peek(&self, offset: usize) -> Option<u8> {
    self._maybe_read_offset(offset)
  }

  /// Peek at `count` bytes starting `offset` past the read cursor,
  /// without consuming anything.
  #[inline(always)]
  pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> {
    self._maybe_read_slice_offset(offset, count)
  }

  // Looking behind.
  /// True if the most recently written output byte equals `c`.
  pub fn last_is(&self, c: u8) -> bool {
    self.write_next > 0 && self.code[self.write_next - 1] == c
  }

  // Consuming source characters.
  /// Skip and return the next character.
  /// Will result in an error if exceeds bounds.
  #[inline(always)]
  pub fn skip(&mut self) -> ProcessingResult<u8> {
    self
      ._maybe_read_offset(0)
      .map(|c| {
        self.read_next += 1;
        c
      })
      .ok_or(ErrorType::UnexpectedEnd)
  }

  /// Skip `amount` characters known to exist.
  // NOTE(review): the debug assertion only checks that one byte remains,
  // not `amount` bytes; an oversized `amount` pushes `read_next` past the
  // end without tripping it — TODO confirm intended.
  #[inline(always)]
  pub fn skip_amount_expect(&mut self, amount: usize) {
    debug_assert!(!self.at_end(), "skip known characters");
    self.read_next += amount;
  }

  /// Skip one character known to exist (debug-asserted).
  #[inline(always)]
  pub fn skip_expect(&mut self) {
    debug_assert!(!self.at_end(), "skip known character");
    self.read_next += 1;
  }

  // Writing characters directly.
  /// Write `c` to output. Will panic if exceeds bounds.
  #[inline(always)]
  pub fn write(&mut self, c: u8) {
    self.code[self.write_next] = c;
    self.write_next += 1;
  }

  /// Lowercase the ASCII bytes of `range` in place.
  #[inline(always)]
  pub fn make_lowercase(&mut self, range: ProcessorRange) {
    self.code[range.start..range.end].make_ascii_lowercase();
  }

  /// Roll the write cursor back by `len`, discarding the last `len`
  /// written bytes.
  pub fn undo_write(&mut self, len: usize) {
    self.write_next -= len;
  }

  /// Copy the bytes of `s` to the current write position and return the
  /// output range they now occupy.
  #[inline(always)]
  pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
    let dest_start = self.write_next;
    let dest_end = dest_start + s.len();
    self.code.copy_within(s.start..s.end, dest_start);
    self.write_next = dest_end;
    ProcessorRange {
      start: dest_start,
      end: dest_end,
    }
  }

  /// Write `s` to output. Will panic if exceeds bounds.
  #[inline(always)]
  pub fn write_slice(&mut self, s: &[u8]) {
    self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);
    self.write_next += s.len();
  }

  /// UTF-8-encode `c` and write its bytes (at most 4) to output.
  #[inline(always)]
  pub fn write_utf8(&mut self, c: char) {
    let mut encoded = [0u8; 4];
    self.write_slice(c.encode_utf8(&mut encoded).as_bytes());
  }

  // Shifting characters.
  /// Copy the next source byte to output and return it, or error with
  /// `UnexpectedEnd` when the source is exhausted.
  #[inline(always)]
  pub fn accept(&mut self) -> ProcessingResult<u8> {
    self
      ._maybe_read_offset(0)
      .map(|c| {
        self.code[self.write_next] = c;
        self.read_next += 1;
        self.write_next += 1;
        c
      })
      .ok_or(ErrorType::UnexpectedEnd)
  }

  /// Like `accept`, but for a byte known to exist (debug-asserted).
  #[inline(always)]
  pub fn accept_expect(&mut self) -> u8 {
    debug_assert!(!self.at_end());
    let c = self._read_offset(0);
    self.code[self.write_next] = c;
    self.read_next += 1;
    self.write_next += 1;
    c
  }

  /// Copy `count` bytes known to exist from source to output.
  // NOTE(review): `count - 1` underflows in debug builds when count == 0;
  // callers presumably always pass count >= 1 — TODO confirm.
  #[inline(always)]
  pub fn accept_amount_expect(&mut self, count: usize) {
    debug_assert!(self._in_bounds(count - 1));
    self._shift(count);
  }

  // Since we consume the Processor, we must provide a full Error with positions.
  /// Consume the processor and return the final output length
  /// (the minified content occupies `code[..written]`).
  #[inline(always)]
  pub fn finish(self) -> Result<usize, Error> {
    debug_assert!(self.at_end());
    Ok(self.write_next)
  }
}
377
378impl Debug for Processor<'_> {
379  fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
380    f.write_str(&debug_repr(
381      self.code,
382      self.read_next as isize,
383      self.write_next as isize,
384    ))?;
385    Ok(())
386  }
387}