vastlint_core/fix.rs
1//! Automatic repair of common VAST XML issues.
2//!
3//! [`fix`] and [`fix_with_context`] validate a VAST document, apply all
4//! deterministic fixes, and return the repaired XML alongside a list of what
5//! was changed and what could not be automatically repaired.
6//!
7//! # What gets fixed
8//!
9//! Only issues with a single unambiguous correct form are auto-repaired:
10//!
11//! - **HTTP → HTTPS** in `<MediaFile>`, `<Tracking>`, `<Impression>`, and all
12//! other URL-bearing elements. The scheme is rewritten; nothing else changes.
13//! - **Deprecated `conditionalAd` attribute** removed from `<Ad>` elements on
14//! VAST 4.1+ documents.
15//!
16//! Issues that require human judgment (missing required elements, wrong enum
17//! values, structural problems) are left untouched and appear in
18//! [`FixResult::remaining`].
19//!
//! # Formatting preservation
//!
//! Fixes are applied as textual edits to the input string; the document is
//! not re-serialized from the parsed model. XML comments, CDATA sections,
//! processing instructions, `<!DOCTYPE>` declarations, and whitespace that
//! are not part of a fix therefore appear unchanged in the repaired output.
//! The parsed model is consulted only to decide which fixes to report.
26//!
27//! # Example
28//!
29//! ```rust
30//! let xml = r#"<VAST version="4.2">
31//! <Ad><InLine>
32//! <AdSystem>Demo</AdSystem>
33//! <AdTitle>Ad</AdTitle>
34//! <Impression>http://track.example.com/imp</Impression>
35//! <Creatives>
36//! <Creative>
37//! <Linear>
38//! <Duration>00:00:15</Duration>
39//! <MediaFiles>
40//! <MediaFile delivery="progressive" type="video/mp4"
41//! width="640" height="360">
42//! http://cdn.example.com/ad.mp4
43//! </MediaFile>
44//! </MediaFiles>
45//! </Linear>
46//! </Creative>
47//! </Creatives>
48//! </InLine></Ad>
49//! </VAST>"#;
50//!
51//! let result = vastlint_core::fix(xml);
52//! assert!(result.applied.iter().any(|f| f.rule_id == "VAST-2.0-mediafile-https"));
53//! // The repaired XML has https:// URLs.
54//! assert!(result.xml.contains("https://cdn.example.com/ad.mp4"));
55//! ```
56
57use crate::{Issue, ValidationContext};
58
/// Element names whose text content is treated as a URL.
///
/// `check_http_elements` consults this list to decide whether an element
/// with `http://` text is reported under the tracking HTTPS rule. `MediaFile`
/// is listed too, but that function tests for it first and reports it under
/// the media-file rule instead.
const URL_TEXT_ELEMENTS: &[&str] = &[
    "MediaFile",
    "Impression",
    "Error",
    // Click* / CustomClick
    "ClickThrough",
    "ClickTracking",
    "CustomClick",
    // Icon*
    "IconClickThrough",
    "IconClickTracking",
    "IconViewTracking",
    // NonLinear*
    "NonLinearClickThrough",
    "NonLinearClickTracking",
    // Companion*
    "CompanionClickThrough",
    "CompanionClickTracking",
    // View state elements
    "Viewable",
    "NotViewable",
    "ViewUndetermined",
    "VASTAdTagURI",
    "Tracking",
];
81
82// ── Public types ──────────────────────────────────────────────────────────────
83
/// Record of one automatic repair that was made to a document.
#[derive(Debug, Clone)]
pub struct AppliedFix {
    /// Identifier of the validation rule the repair addresses, for example
    /// `"VAST-2.0-mediafile-https"`.
    pub rule_id: &'static str,
    /// Plain-language summary of the change.
    pub description: String,
    /// XPath-like location of the change. The fixes in this module currently
    /// report the document root, `"/VAST"`.
    pub path: String,
}
94
95/// The result of a [`fix`] or [`fix_with_context`] call.
96#[derive(Debug)]
97pub struct FixResult {
98 /// The repaired VAST XML. Always well-formed; may differ structurally from
99 /// the input if the input contained XML comments or processing instructions
100 /// (these are stripped — see module-level docs).
101 pub xml: String,
102 /// All fixes that were successfully applied, in document order.
103 pub applied: Vec<AppliedFix>,
104 /// Issues that remain after all fixes were applied. These require manual
105 /// intervention.
106 pub remaining: Vec<Issue>,
107}
108
109// ── Entry points ──────────────────────────────────────────────────────────────
110
111/// Fix a VAST XML string using default settings.
112///
113/// Applies all deterministic fixes and returns the repaired XML, a list of
114/// what was changed, and any issues that could not be automatically repaired.
115///
116/// For the list of fixable rules, see the module-level documentation.
117pub fn fix(input: &str) -> FixResult {
118 fix_with_context(input, ValidationContext::default())
119}
120
121/// Fix a VAST XML string with caller-supplied context.
122///
123/// Use this when you need to declare wrapper chain depth or override rule
124/// severity. For simple repair, prefer [`fix`].
125pub fn fix_with_context(input: &str, context: ValidationContext) -> FixResult {
126 let mut xml = input.to_owned();
127 let mut applied: Vec<AppliedFix> = Vec::new();
128
129 // ── HTTPS upgrade — raw string replacement ────────────────────────────────
130 // Operate directly on the raw XML string so CDATA sections, comments, and
131 // all formatting are preserved exactly. We replace every occurrence of
132 // "http://" with "https://"; in a VAST document the only http:// values
133 // are tracking/media URLs which should all be upgraded.
134 let http_count = xml.matches("http://").count();
135 if http_count > 0 {
136 xml = xml.replace("http://", "https://");
137
138 // Record one AppliedFix per affected URL element type found in the doc.
139 // Parse the pre-fix document to check which element types had http:// URLs.
140 let pre_doc = crate::parse::parse(input);
141 let mut had_mediafile_http = false;
142 let mut had_tracking_http = false;
143 check_http_elements(
144 &pre_doc.root,
145 &mut had_mediafile_http,
146 &mut had_tracking_http,
147 );
148
149 if had_mediafile_http {
150 applied.push(AppliedFix {
151 rule_id: "VAST-2.0-mediafile-https",
152 description: format!("Upgraded {} HTTP URL(s) to HTTPS", http_count),
153 path: "/VAST".to_owned(),
154 });
155 }
156 if had_tracking_http {
157 applied.push(AppliedFix {
158 rule_id: "VAST-2.0-tracking-https",
159 description: format!("Upgraded {} HTTP URL(s) to HTTPS", http_count),
160 path: "/VAST".to_owned(),
161 });
162 }
163 }
164
165 // ── conditionalAd removal — raw string replacement ────────────────────────
166 // Remove conditionalAd="..." (any quote style) from <Ad ...> tags.
167 // This preserves all other formatting.
168 let without_cond = remove_conditional_ad_attr(&xml);
169 if without_cond != xml {
170 applied.push(AppliedFix {
171 rule_id: "VAST-4.0-conditionalad",
172 description: "Removed deprecated conditionalAd attribute from <Ad>".to_owned(),
173 path: "/VAST".to_owned(),
174 });
175 xml = without_cond;
176 }
177
178 // Re-validate the repaired XML to find what remains.
179 let remaining = crate::validate_with_context(&xml, context).issues;
180
181 FixResult {
182 xml,
183 applied,
184 remaining,
185 }
186}
187
188/// Walk the parsed element tree and check which URL element types had http:// text.
189fn check_http_elements(
190 node: &crate::parse::Node,
191 had_mediafile: &mut bool,
192 had_tracking: &mut bool,
193) {
194 if node.text.starts_with("http://") {
195 if node.name == "MediaFile" {
196 *had_mediafile = true;
197 } else if URL_TEXT_ELEMENTS.contains(&node.name.as_str()) {
198 *had_tracking = true;
199 }
200 }
201 for child in &node.children {
202 check_http_elements(child, had_mediafile, had_tracking);
203 }
204}
205
/// Remove the `conditionalAd` attribute from `<Ad ...>` start tags.
///
/// The input is scanned as raw text (no regex, no re-serialization), so
/// everything other than the removed attribute is copied through unchanged.
/// An attribute is removed only when all of the following hold:
///
/// - it sits inside an `<Ad` start tag, outside any quoted attribute value;
/// - its name is exactly `conditionalAd` and is preceded by whitespace;
/// - it has a value in matching single or double quotes (whitespace around
///   `=` is allowed).
///
/// Spaces and tabs immediately before the attribute are removed with it.
/// Text content, comments, CDATA sections, processing instructions, other
/// elements, and malformed or unterminated attributes are left untouched.
fn remove_conditional_ad_attr(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input;
    while let Some(lt) = rest.find('<') {
        // Character data before the next markup construct is copied verbatim.
        let (text, markup) = rest.split_at(lt);
        out.push_str(text);

        let (unit, tail) = markup.split_at(markup_len(markup));
        if is_ad_start_tag(unit) {
            push_without_conditional_ad(unit, &mut out);
        } else {
            out.push_str(unit);
        }
        rest = tail;
    }
    out.push_str(rest);
    out
}

/// XML whitespace: space, tab, carriage return, or line feed.
fn is_xml_space(ch: char) -> bool {
    matches!(ch, ' ' | '\t' | '\r' | '\n')
}

/// Byte length of the markup construct at the start of `markup`, which must
/// begin with `<`. Comments, CDATA sections, and processing instructions run
/// to their terminator; any other tag runs to the first `>` outside a quoted
/// attribute value. An unterminated construct extends to the end of input.
fn markup_len(markup: &str) -> usize {
    const OPAQUE: &[(&str, &str)] = &[("<!--", "-->"), ("<![CDATA[", "]]>"), ("<?", "?>")];
    for &(open, close) in OPAQUE {
        if let Some(body) = markup.strip_prefix(open) {
            return body
                .find(close)
                .map_or(markup.len(), |at| open.len() + at + close.len());
        }
    }

    let mut quote: Option<char> = None;
    for (idx, ch) in markup.char_indices() {
        match quote {
            Some(q) if q == ch => quote = None,
            Some(_) => {}
            None if ch == '"' || ch == '\'' => quote = Some(ch),
            None if ch == '>' => return idx + 1,
            None => {}
        }
    }
    markup.len()
}

/// True when `tag` is a start tag for an element named exactly `Ad` that has
/// room for attributes (`<Ad` followed by whitespace).
fn is_ad_start_tag(tag: &str) -> bool {
    tag.strip_prefix("<Ad")
        .and_then(|after_name| after_name.chars().next())
        .map_or(false, is_xml_space)
}

/// Append `tag` to `out`, omitting any `conditionalAd` attribute.
fn push_without_conditional_ad(tag: &str, out: &mut String) {
    let mut quote: Option<char> = None;
    let mut prev_is_space = false;
    let mut rest = tag;
    while let Some(ch) = rest.chars().next() {
        // An attribute name can only start after whitespace, outside quotes.
        if quote.is_none() && prev_is_space {
            if let Some(len) = conditional_ad_attr_len(rest) {
                // Drop the padding that separated the attribute from what
                // precedes it.
                while out.ends_with(' ') || out.ends_with('\t') {
                    out.pop();
                }
                rest = &rest[len..];
                prev_is_space = false;
                continue;
            }
        }

        match quote {
            Some(q) if q == ch => quote = None,
            Some(_) => {}
            None if ch == '"' || ch == '\'' => quote = Some(ch),
            None => {}
        }
        prev_is_space = is_xml_space(ch);
        out.push(ch);
        rest = &rest[ch.len_utf8()..];
    }
}

/// If `s` begins with a complete `conditionalAd = "value"` attribute, return
/// its byte length (name through closing quote); otherwise `None`.
fn conditional_ad_attr_len(s: &str) -> Option<usize> {
    let after_name = s.strip_prefix("conditionalAd")?;
    let after_eq = after_name
        .trim_start_matches(is_xml_space)
        .strip_prefix('=')?
        .trim_start_matches(is_xml_space);
    let quote = after_eq
        .chars()
        .next()
        .filter(|c| matches!(*c, '"' | '\''))?;
    let value = &after_eq[quote.len_utf8()..];
    let close = value.find(quote)?;
    Some(s.len() - value.len() + close + quote.len_utf8())
}
242
243// ── Tests ─────────────────────────────────────────────────────────────────────
244
#[cfg(test)]
mod tests {
    use super::*;

    const MEDIAFILE_RULE: &str = "VAST-2.0-mediafile-https";
    const TRACKING_RULE: &str = "VAST-2.0-tracking-https";
    const CONDITIONAL_AD_RULE: &str = "VAST-4.0-conditionalad";

    /// VAST 4.2 fixture with one `http://` impression URL and one `http://`
    /// media file URL.
    const HTTP_VAST: &str = r#"<VAST version="4.2">
 <Ad id="1"><InLine>
 <AdSystem>Demo</AdSystem>
 <AdTitle>Test</AdTitle>
 <AdServingId>sid-1</AdServingId>
 <Impression>http://track.example.com/imp</Impression>
 <Creatives>
 <Creative>
 <UniversalAdId idRegistry="ad-id.org">UID-1</UniversalAdId>
 <Linear>
 <Duration>00:00:30</Duration>
 <MediaFiles>
 <MediaFile delivery="progressive" type="video/mp4"
 width="1920" height="1080">
 http://cdn.example.com/ad.mp4
 </MediaFile>
 </MediaFiles>
 </Linear>
 </Creative>
 </Creatives>
 </InLine></Ad>
</VAST>"#;

    /// True when `result.applied` contains an entry for `rule_id`.
    fn was_applied(result: &FixResult, rule_id: &str) -> bool {
        result.applied.iter().any(|entry| entry.rule_id == rule_id)
    }

    #[test]
    fn upgrades_mediafile_url_to_https() {
        let fixed = fix(HTTP_VAST);
        assert!(fixed.xml.contains("https://cdn.example.com/ad.mp4"));
        assert!(!fixed.xml.contains("http://cdn.example.com/ad.mp4"));
        assert!(was_applied(&fixed, MEDIAFILE_RULE));
    }

    #[test]
    fn upgrades_impression_url_to_https() {
        let fixed = fix(HTTP_VAST);
        assert!(fixed.xml.contains("https://track.example.com/imp"));
        assert!(was_applied(&fixed, TRACKING_RULE));
    }

    #[test]
    fn https_urls_are_not_modified() {
        let already_secure = HTTP_VAST.replace("http://cdn", "https://cdn");
        let fixed = fix(&already_secure);
        assert!(!was_applied(&fixed, MEDIAFILE_RULE));
        assert!(fixed.xml.contains("https://cdn.example.com/ad.mp4"));
    }

    #[test]
    fn removes_conditional_ad_attribute() {
        let xml = r#"<VAST version="4.1">
 <Ad id="1" conditionalAd="true"><InLine>
 <AdSystem>Demo</AdSystem>
 <AdTitle>Test</AdTitle>
 <AdServingId>sid-1</AdServingId>
 <Impression>https://t.example.com/imp</Impression>
 <Creatives/>
 </InLine></Ad>
</VAST>"#;
        let fixed = fix(xml);
        assert!(!fixed.xml.contains("conditionalAd"));
        assert!(was_applied(&fixed, CONDITIONAL_AD_RULE));
    }

    #[test]
    fn repaired_xml_is_well_formed() {
        let fixed = fix(HTTP_VAST);
        // Round-trip: parsing the output should not produce a parse error.
        let reparsed = crate::parse::parse(&fixed.xml);
        assert!(reparsed.parse_error.is_none(), "{:?}", reparsed.parse_error);
    }

    #[test]
    fn no_applied_fixes_on_clean_document() {
        let clean = HTTP_VAST
            .replace("http://cdn", "https://cdn")
            .replace("http://track", "https://track");
        let fixed = fix(&clean);
        assert!(fixed.applied.is_empty());
    }

    #[test]
    fn fix_result_remaining_only_contains_unfixable_issues() {
        // Once the HTTP URLs are upgraded, neither HTTPS rule may still be
        // reported among the remaining issues.
        let fixed = fix(HTTP_VAST);
        let no_https_remaining = fixed
            .remaining
            .iter()
            .all(|issue| issue.id != MEDIAFILE_RULE && issue.id != TRACKING_RULE);
        assert!(no_https_remaining);
    }
}