1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
use std::char::from_u32;
use crate::err::ProcessingResult;
use crate::gen::entities::ENTITY;
use crate::proc::checkpoint::Checkpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
/// Outcome of parsing a single `&…` character reference.
///
/// Each variant carries what `EntityType::keep` needs in order to emit the
/// entity to the processor's output.
#[derive(Clone, Copy)]
pub enum EntityType {
/// Input that did not parse as an entity. Holds the range consumed since the
/// checkpoint taken at the `&`; `keep` hands it to `write_range`.
Malformed(ProcessorRange),
/// Entity that resolved to a single byte: an ASCII numeric reference, a named
/// entity whose replacement is one byte long, or a numeric reference made up
/// only of zeros (stored as `b'\0'`).
Ascii(u8),
/// Named entity whose replacement, as returned by the `ENTITY` trie, is not
/// exactly one byte long.
Named(&'static [u8]),
/// Numeric reference with more digits than allowed, or whose value is not a
/// valid `char`; `keep` emits U+FFFD for it.
InvalidNumeric,
/// Numeric reference that decoded to a non-ASCII `char`.
Numeric(char),
}
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::Malformed(r) => { proc.write_range(r); }
EntityType::Ascii(c) => { proc.write(c); }
EntityType::Named(s) => { proc.write_slice(s); }
EntityType::InvalidNumeric => { proc.write_utf8('\u{FFFD}'); }
EntityType::Numeric(c) => { proc.write_utf8(c); }
};
}
}
/// Parses the body of a numeric character reference.
///
/// * `skip_amount` – number of prefix bytes to skip before the digits
///   (the caller passes 1 for `#` and 2 for `#x`).
/// * `max_len` – maximum number of significant digits (leading zeros are
///   matched separately and do not count); longer runs yield `InvalidNumeric`.
/// * `digit_pred` – predicate selecting the bytes that count as digits.
/// * `on_digit` – folds one digit byte into the accumulated value.
///
/// Returns `None` when there are no digits at all (not even zeros), so that
/// the caller can treat the input as malformed. An optional trailing `;` is
/// consumed in every case.
fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit_pred: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32) -> Option<EntityType> {
    proc.skip_amount_expect(skip_amount);
    // Leading zeros are matched on their own so they do not count towards `max_len`.
    let saw_zeros = proc.m(WhileChar(b'0'), Discard).nonempty();
    let digits = proc.m(WhilePred(digit_pred), Discard);
    // The terminating semicolon is optional; consume it if present.
    proc.m(IsChar(b';'), Discard);

    if digits.empty() {
        // Only zeros means the value is zero; no digits at all is not a numeric entity.
        return if saw_zeros {
            Some(EntityType::Ascii(b'\0'))
        } else {
            None
        };
    }

    if digits.len() > max_len {
        return Some(EntityType::InvalidNumeric);
    }

    let code_point = proc[digits]
        .iter()
        .fold(0u32, |acc, &digit| on_digit(acc, digit));

    let entity = match from_u32(code_point) {
        Some(ch) if ch.is_ascii() => EntityType::Ascii(ch as u8),
        Some(ch) => EntityType::Numeric(ch),
        // Not a valid Unicode scalar value.
        None => EntityType::InvalidNumeric,
    };
    Some(entity)
}
/// Tries to match a named entity at the current position using the `ENTITY`
/// trie.
///
/// Returns `None` when the trie has no match. A one-byte replacement is
/// reported as `EntityType::Ascii`; anything else as `EntityType::Named`.
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
    let replacement = proc.m_trie(ENTITY, Discard)?;
    let entity = if replacement.len() == 1 {
        EntityType::Ascii(replacement[0])
    } else {
        EntityType::Named(replacement)
    };
    Some(entity)
}
/// Parses one character reference, starting at the `&` under the cursor.
///
/// Dispatches on the bytes following the `&`:
///
/// * `#x` / `#X` – hexadecimal numeric reference (at most 6 significant digits);
/// * `#` followed by anything else – decimal numeric reference (at most 7
///   significant digits);
/// * anything else – named entity lookup.
///
/// When the chosen parser finds no entity, the result is
/// `EntityType::Malformed` carrying everything consumed since the `&`.
///
/// The leading `&` is asserted via `expect()` on the match result, so callers
/// must only invoke this when the next byte is `&`. As written, this function
/// always returns `Ok`.
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
    // Remember where the entity began so a failed parse can report its range.
    let checkpoint = Checkpoint::new(proc);
    proc.m(IsChar(b'&'), Discard).expect();

    let parsed = match proc.peek(0) {
        Some(b'#') => match proc.peek(1) {
            // The HTML Standard accepts both a lowercase and an uppercase `x`
            // as the hexadecimal marker, so `&#X41;` must decode like `&#x41;`.
            Some(b'x') | Some(b'X') => parse_numeric(proc, 2, 6, is_hex_digit, |val, c| {
                let digit = match c {
                    c if is_digit(c) => c - b'0',
                    c if is_upper_hex_digit(c) => c - b'A' + 10,
                    c if is_lower_hex_digit(c) => c - b'a' + 10,
                    // Only bytes accepted by `is_hex_digit` are passed in; this
                    // presumes that predicate is the union of the three above.
                    _ => unreachable!(),
                };
                val * 16 + digit as u32
            }),
            _ => parse_numeric(proc, 1, 7, is_digit, |val, c| val * 10 + (c - b'0') as u32),
        },
        _ => parse_name(proc),
    };

    Ok(parsed.unwrap_or_else(|| EntityType::Malformed(checkpoint.consumed_range(proc))))
}