1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
use crate::{
Element,
dom_walker::is_block_element,
element_handler::{HandlerResult, Handlers},
node_util::parent_tag_name_equals,
options::TranslationMode,
text_util::concat_strings,
};
use html5ever::serialize::{HtmlSerializer, SerializeOpts, Serializer, TraversalScope, serialize};
use markup5ever_rcdom::{NodeData, SerializableHandle};
use std::io::{self, Write};
// Handler for tags whose only requirement for a faithful Markdown translation
// is the tag name of their parent.
//
// Always returns `Some`. In faithful mode, when the parent's tag is not one of
// `tag_names`, the element is emitted as HTML via `serialize_element` and
// flagged as not Markdown-translated. In every other case the translated
// children are returned, stripped of leading/trailing newlines and wrapped in
// blank lines, with the caller-supplied `markdown_translated` flag.
pub(super) fn handle_or_serialize_by_parent(
    handlers: &dyn Handlers,
    // The element being handled.
    element: &Element,
    // Tag names that are acceptable for this element's parent.
    tag_names: &Vec<&str>,
    // The `markdown_translated` value to report when the element is translated
    // to Markdown rather than serialized as HTML.
    markdown_translated: bool,
) -> Option<HandlerResult> {
    let is_faithful = handlers.options().translation_mode == TranslationMode::Faithful;

    // Faithful mode with an unexpected parent (e.g. `<tbody>` outside
    // `<table>`, `<td>` outside `<tr>`): fall back to HTML.
    if is_faithful && !parent_tag_name_equals(element.node, tag_names) {
        return Some(HandlerResult {
            content: serialize_element(handlers, element),
            markdown_translated: false,
        });
    }

    // Otherwise translate the children and normalize the surrounding newlines
    // to exactly one blank line on each side.
    let walked = handlers.walk_children(element.node).content;
    let body = walked.trim_matches('\n');
    Some(HandlerResult {
        content: concat_strings!("\n\n", body, "\n\n"),
        markdown_translated,
    })
}
// Serialize an element (transform it back to HTML).
//
// `element.node` is expected to be an element node. How the children are
// rendered depends on `is_block_element(element.tag)`, following the CommonMark
// spec:
//
// - Block elements become [HTML
//   blocks](https://spec.commonmark.org/0.31.2/#html-blocks), which contain
//   only HTML, not Markdown. The element and all of its children are
//   serialized as HTML, line endings that would produce a blank line are
//   escaped (see `escape_blank_lines`), and the result is wrapped in `"\n\n"`
//   on both sides.
// - All other elements become [raw HTML
//   inlines](https://spec.commonmark.org/0.31.2/#raw-html), which contain
//   Markdown. Only the start and end tags are serialized; the children are
//   replaced by the content returned from `handlers.walk_children`.
//
// Errors are not propagated: if serialization fails, the error's message is
// returned as the content.
pub(crate) fn serialize_element(handlers: &dyn Handlers, element: &Element) -> String {
    let render = || -> io::Result<String> {
        let opts = SerializeOpts {
            traversal_scope: TraversalScope::IncludeNode,
            ..Default::default()
        };
        let mut bytes = vec![];
        if !is_block_element(element.tag) {
            // Write this element's start tag.
            let NodeData::Element { name, attrs, .. } = &element.node.data else {
                return Err(io::Error::other("Not an element."));
            };
            let mut ser = HtmlSerializer::new(&mut bytes, opts);
            ser.start_elem(
                name.clone(),
                attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
            )?;
            // Write out the contents, without escaping them. The standard
            // serialization process escapes the contents, hence this manual
            // approach.
            ser.writer
                .write_all(handlers.walk_children(element.node).content.as_bytes())?;
            // Write the end tag, if needed (HtmlSerializer logic will
            // automatically omit this).
            ser.end_elem(name.clone())?;
            String::from_utf8(bytes).map_err(io::Error::other)
        } else {
            // Serialize this element and all of its children as HTML.
            let sh: SerializableHandle = SerializableHandle::from(element.node.clone());
            serialize(&mut bytes, &sh, opts)?;
            let html = String::from_utf8(bytes).map_err(io::Error::other)?;
            // Consecutive newlines would terminate the HTML block per the
            // CommonMark spec, so escape them before wrapping the block in
            // blank lines.
            let escaped = escape_blank_lines(&html);
            Ok(concat_strings!("\n\n", escaped, "\n\n"))
        }
    };
    render().unwrap_or_else(|err| err.to_string())
}

// Escape line endings that would otherwise create a blank line inside an HTML
// block.
//
// Whenever a line ending (`\n` or `\r\n`) is followed, after optional
// whitespace other than `\r`/`\n`, by a second line ending, the first is kept
// as-is and the second is replaced by numeric character references (`&#10;`
// for `\n`, `&#13;&#10;` for `\r\n`), so the line break survives in the HTML
// without producing a blank line in the Markdown. Scanning restarts after each
// escaped line ending, so in a longer run every second line ending is escaped.
// A bare `\r` (not followed by `\n`) is always copied through unchanged.
//
// This is a hand-coded version modeled on the following regex:
//
// ```Rust
// Regex::new(r#"(\r?\n\s*)(\r?\n\s*)"#)
//     .unwrap()
//     .replace_all(&s, |caps: &Captures| {
//         caps[1].to_string()
//             + &(caps[2].replace("\r", "&#13;").replace("\n", "&#10;"))
//     })
// ```
//
// 1. Output the next character. If it is not a `\r` or `\n`, restart.
// 2. If it was a `\r` and the following character isn't a `\n`, restart.
//    Otherwise, output the `\n`.
// 3. While the next character is whitespace but not `\n` or `\r`, output it.
// 4. If the next character is a `\r` and the peeked following character isn't
//    a `\n`, output the `\r` and restart. Otherwise, output an encoded
//    `\r\n`.
// 5. If the next character is a `\n`, output an encoded `\n`. Any other
//    character is output unchanged, then restart.
// 6. While the next character is whitespace but not `\n` or `\r`, output it.
//    Then restart.
fn escape_blank_lines(html: &str) -> String {
    // Copy a run of whitespace other than `\r` and `\n` from `chars` to `out`.
    fn copy_inline_whitespace(
        chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
        out: &mut String,
    ) {
        while let Some(ws) = chars.next_if(|&ch| ch.is_whitespace() && ch != '\r' && ch != '\n') {
            out.push(ws);
        }
    }

    let mut result = String::with_capacity(html.len());
    let mut chars = html.chars().peekable();
    while let Some(c) = chars.next() {
        // Step 1.
        result.push(c);
        match c {
            '\n' => {}
            // Step 2.
            '\r' if chars.next_if_eq(&'\n').is_some() => result.push('\n'),
            // Either not a line ending, or a bare `\r`.
            _ => continue,
        }
        // Step 3.
        copy_inline_whitespace(&mut chars, &mut result);
        // Steps 4 and 5.
        match chars.next() {
            Some('\n') => result.push_str("&#10;"),
            Some('\r') if chars.next_if_eq(&'\n').is_some() => result.push_str("&#13;&#10;"),
            // Not a second line ending (this includes a bare `\r`): output it
            // unchanged and restart.
            Some(other) => {
                result.push(other);
                continue;
            }
            None => break,
        }
        // Step 6.
        copy_inline_whitespace(&mut chars, &mut result);
    }
    result
}
// Return early from the enclosing handler with an HTML translation of the
// element when faithful translation mode is active and the element has more
// attributes than allowed.
//
// The expansion contains a `return Some(HandlerResult { .. })`, so this macro
// must be invoked inside a function returning `Option<HandlerResult>`. When
// the condition does not hold, the expansion does nothing and execution
// continues after the macro invocation.
#[macro_export]
macro_rules! serialize_if_faithful {
    (
        // The handlers used to read the options and to serialize the element.
        $handlers: expr,
        // The element to translate.
        $element: expr,
        // The maximum number of attributes this element may have and still be
        // translated to Markdown. Supply -1 to always serialize in faithful
        // mode, even when the element has no attributes.
        $num_attrs_allowed: expr
    ) => {
        if $handlers.options().translation_mode == $crate::options::TranslationMode::Faithful
            && $element.attrs.len() as i64 > $num_attrs_allowed
        {
            let html =
                $crate::element_handler::element_util::serialize_element($handlers, &$element);
            return Some($crate::element_handler::HandlerResult {
                content: html,
                // The content is HTML, not Markdown.
                markdown_translated: false,
            });
        }
    };
}