Skip to main content

vastlint_core/
fix.rs

1//! Automatic repair of common VAST XML issues.
2//!
3//! [`fix`] and [`fix_with_context`] validate a VAST document, apply all
4//! deterministic fixes, and return the repaired XML alongside a list of what
5//! was changed and what could not be automatically repaired.
6//!
7//! # What gets fixed
8//!
9//! Only issues with a single unambiguous correct form are auto-repaired:
10//!
11//! - **HTTP → HTTPS** in `<MediaFile>`, `<Tracking>`, `<Impression>`, and all
12//!   other URL-bearing elements. The scheme is rewritten; nothing else changes.
13//! - **Deprecated `conditionalAd` attribute** removed from `<Ad>` elements on
14//!   VAST 4.1+ documents.
15//!
16//! Issues that require human judgment (missing required elements, wrong enum
17//!   values, structural problems) are left untouched and appear in
18//!   [`FixResult::remaining`].
19//!
//! # Formatting is preserved
//!
//! Fixes are applied as edits to the raw XML text; the document is not
//! re-serialized from a parsed tree. XML comments, processing instructions,
//! CDATA sections, and whitespace are therefore carried over into the repaired
//! output rather than being dropped. Note that the HTTPS upgrade is textual, so
//! an `http://` URL inside a comment or CDATA section is upgraded as well.
26//!
27//! # Example
28//!
29//! ```rust
30//! let xml = r#"<VAST version="4.2">
31//!   <Ad><InLine>
32//!     <AdSystem>Demo</AdSystem>
33//!     <AdTitle>Ad</AdTitle>
34//!     <Impression>http://track.example.com/imp</Impression>
35//!     <Creatives>
36//!       <Creative>
37//!         <Linear>
38//!           <Duration>00:00:15</Duration>
39//!           <MediaFiles>
40//!             <MediaFile delivery="progressive" type="video/mp4"
41//!                        width="640" height="360">
42//!               http://cdn.example.com/ad.mp4
43//!             </MediaFile>
44//!           </MediaFiles>
45//!         </Linear>
46//!       </Creative>
47//!     </Creatives>
48//!   </InLine></Ad>
49//! </VAST>"#;
50//!
51//! let result = vastlint_core::fix(xml);
52//! assert!(result.applied.iter().any(|f| f.rule_id == "VAST-2.0-mediafile-https"));
53//! // The repaired XML has https:// URLs.
54//! assert!(result.xml.contains("https://cdn.example.com/ad.mp4"));
55//! ```
56
57use crate::{Issue, ValidationContext};
58
/// Names of elements whose text content is a URL.
///
/// Used only to classify which HTTPS rule ID to report in an [`AppliedFix`]:
/// `MediaFile` is reported under the media-file rule, and a match on any other
/// name in this list is reported under the tracking rule. The list does not
/// limit which URLs get rewritten.
const URL_TEXT_ELEMENTS: &[&str] = &[
    "MediaFile",
    "Impression",
    "Error",
    "ClickThrough",
    "ClickTracking",
    "CustomClick",
    "IconClickThrough",
    "IconClickTracking",
    "IconViewTracking",
    "NonLinearClickThrough",
    "NonLinearClickTracking",
    "CompanionClickThrough",
    "CompanionClickTracking",
    "Viewable",
    "NotViewable",
    "ViewUndetermined",
    "VASTAdTagURI",
    "Tracking",
];
81
82// ── Public types ──────────────────────────────────────────────────────────────
83
/// A single fix that was successfully applied to the document.
///
/// One entry is recorded per rule, not per modified element.
#[derive(Debug, Clone)]
pub struct AppliedFix {
    /// The rule ID this fix addresses, e.g. `"VAST-2.0-mediafile-https"`.
    pub rule_id: &'static str,
    /// Human-readable description of what was changed.
    pub description: String,
    /// XPath-like path to the element that was modified. The fixes in this
    /// module currently always report the document root, `/VAST`.
    pub path: String,
}
94
/// The result of a [`fix`] or [`fix_with_context`] call.
#[derive(Debug)]
pub struct FixResult {
    /// The repaired VAST XML. Fixes are applied as edits to the raw input
    /// text, so everything a fix does not touch (comments, CDATA sections,
    /// whitespace) is carried over from the input unchanged.
    pub xml: String,
    /// All fixes that were successfully applied, in the order the fix passes
    /// run: HTTPS upgrades first, then `conditionalAd` removal.
    pub applied: Vec<AppliedFix>,
    /// Issues reported when the repaired XML is validated again. These require
    /// manual intervention.
    pub remaining: Vec<Issue>,
}
108
109// ── Entry points ──────────────────────────────────────────────────────────────
110
111/// Fix a VAST XML string using default settings.
112///
113/// Applies all deterministic fixes and returns the repaired XML, a list of
114/// what was changed, and any issues that could not be automatically repaired.
115///
116/// For the list of fixable rules, see the module-level documentation.
117pub fn fix(input: &str) -> FixResult {
118    fix_with_context(input, ValidationContext::default())
119}
120
121/// Fix a VAST XML string with caller-supplied context.
122///
123/// Use this when you need to declare wrapper chain depth or override rule
124/// severity. For simple repair, prefer [`fix`].
125pub fn fix_with_context(input: &str, context: ValidationContext) -> FixResult {
126    let mut xml = input.to_owned();
127    let mut applied: Vec<AppliedFix> = Vec::new();
128
129    // ── HTTPS upgrade — raw string replacement ────────────────────────────────
130    // Operate directly on the raw XML string so CDATA sections, comments, and
131    // all formatting are preserved exactly. We replace every occurrence of
132    // "http://" with "https://"; in a VAST document the only http:// values
133    // are tracking/media URLs which should all be upgraded.
134    let http_count = xml.matches("http://").count();
135    if http_count > 0 {
136        xml = xml.replace("http://", "https://");
137
138        // Record one AppliedFix per affected URL element type found in the doc.
139        // Parse the pre-fix document to check which element types had http:// URLs.
140        let pre_doc = crate::parse::parse(input);
141        let mut had_mediafile_http = false;
142        let mut had_tracking_http = false;
143        check_http_elements(
144            &pre_doc.root,
145            &mut had_mediafile_http,
146            &mut had_tracking_http,
147        );
148
149        if had_mediafile_http {
150            applied.push(AppliedFix {
151                rule_id: "VAST-2.0-mediafile-https",
152                description: format!("Upgraded {} HTTP URL(s) to HTTPS", http_count),
153                path: "/VAST".to_owned(),
154            });
155        }
156        if had_tracking_http {
157            applied.push(AppliedFix {
158                rule_id: "VAST-2.0-tracking-https",
159                description: format!("Upgraded {} HTTP URL(s) to HTTPS", http_count),
160                path: "/VAST".to_owned(),
161            });
162        }
163    }
164
165    // ── conditionalAd removal — raw string replacement ────────────────────────
166    // Remove conditionalAd="..." (any quote style) from <Ad ...> tags.
167    // This preserves all other formatting.
168    let without_cond = remove_conditional_ad_attr(&xml);
169    if without_cond != xml {
170        applied.push(AppliedFix {
171            rule_id: "VAST-4.0-conditionalad",
172            description: "Removed deprecated conditionalAd attribute from <Ad>".to_owned(),
173            path: "/VAST".to_owned(),
174        });
175        xml = without_cond;
176    }
177
178    // Re-validate the repaired XML to find what remains.
179    let remaining = crate::validate_with_context(&xml, context).issues;
180
181    FixResult {
182        xml,
183        applied,
184        remaining,
185    }
186}
187
188/// Walk the parsed element tree and check which URL element types had http:// text.
189fn check_http_elements(
190    node: &crate::parse::Node,
191    had_mediafile: &mut bool,
192    had_tracking: &mut bool,
193) {
194    if node.text.starts_with("http://") {
195        if node.name == "MediaFile" {
196            *had_mediafile = true;
197        } else if URL_TEXT_ELEMENTS.contains(&node.name.as_str()) {
198            *had_tracking = true;
199        }
200    }
201    for child in &node.children {
202        check_http_elements(child, had_mediafile, had_tracking);
203    }
204}
205
/// Remove the `conditionalAd` attribute from every `<Ad>` start tag in the raw
/// XML string, together with the whitespace that precedes it.
///
/// Both quote styles are accepted, as is whitespace around the `=`. Everything
/// else is copied through byte-for-byte: text content (so a URL query string
/// such as `?conditionalAd=1` is not touched), comments, CDATA sections,
/// processing instructions, other elements, and other attributes of `<Ad>`.
///
/// Uses a hand-written scan to avoid a regex dependency. An `<Ad>` tag whose
/// attributes are malformed (no `=`, unquoted or unterminated value) is copied
/// unchanged from the malformed attribute onwards.
fn remove_conditional_ad_attr(input: &str) -> String {
    // Constructs whose content is not element markup: (opener, terminator).
    const OPAQUE: [(&str, &str); 3] = [("<!--", "-->"), ("<![CDATA[", "]]>"), ("<?", "?>")];

    let mut out = String::with_capacity(input.len());
    let mut rest = input;
    while let Some(lt) = rest.find('<') {
        // Character data up to the next markup is never modified.
        out.push_str(&rest[..lt]);
        rest = &rest[lt..];

        let opaque = OPAQUE.iter().find(|(open, _)| rest.starts_with(*open));
        let span_len = match opaque {
            // Search for the terminator after the opener so the two cannot overlap.
            Some((open, close)) => rest[open.len()..]
                .find(*close)
                .map_or(rest.len(), |i| open.len() + i + close.len()),
            None => markup_end(rest),
        };
        let (markup, tail) = rest.split_at(span_len);
        if opaque.is_none() && is_ad_start_tag(markup) {
            push_without_conditional_ad(markup, &mut out);
        } else {
            out.push_str(markup);
        }
        rest = tail;
    }
    out.push_str(rest);
    out
}

/// XML whitespace: space, tab, carriage return, line feed.
fn is_xml_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\r' | b'\n')
}

/// Advance `pos` past any XML whitespace in `bytes`.
fn skip_xml_whitespace(bytes: &[u8], mut pos: usize) -> usize {
    while bytes.get(pos).map_or(false, |&b| is_xml_whitespace(b)) {
        pos += 1;
    }
    pos
}

/// Length in bytes of the tag that starts at the beginning of `markup`: up to
/// and including the first `>` outside a quoted attribute value, or the whole
/// string if the tag is unterminated.
fn markup_end(markup: &str) -> usize {
    let mut quote: Option<char> = None;
    for (idx, ch) in markup.char_indices() {
        match quote {
            Some(open) => {
                if ch == open {
                    quote = None;
                }
            }
            None => match ch {
                '"' | '\'' => quote = Some(ch),
                '>' => return idx + 1,
                _ => {}
            },
        }
    }
    markup.len()
}

/// Whether `tag` (which begins with `<`) is a start tag whose local element
/// name is exactly `Ad`, with or without a namespace prefix.
fn is_ad_start_tag(tag: &str) -> bool {
    let name_end = tag
        .bytes()
        .position(|b| is_xml_whitespace(b) || b == b'>' || b == b'/')
        .unwrap_or(tag.len());
    // An end tag (`</Ad>`) yields an empty name here and is rejected.
    tag.get(1..name_end)
        .and_then(|name| name.rsplit(':').next())
        .map_or(false, |local| local == "Ad")
}

/// Append `tag` to `out`, omitting any attribute named `conditionalAd` along
/// with the whitespace that separates it from the preceding token.
fn push_without_conditional_ad(tag: &str, out: &mut String) {
    let bytes = tag.as_bytes();
    // All offsets below sit next to ASCII delimiters, so slicing `tag` at them
    // is always on a character boundary.
    let mut pos = bytes
        .iter()
        .position(|&b| is_xml_whitespace(b) || b == b'>' || b == b'/')
        .unwrap_or(bytes.len());
    out.push_str(&tag[..pos]);

    loop {
        let attr_start = pos; // includes the leading whitespace
        let name_start = skip_xml_whitespace(bytes, attr_start);
        let name_end = bytes[name_start..]
            .iter()
            .position(|&b| is_xml_whitespace(b) || matches!(b, b'=' | b'>' | b'/'))
            .map_or(bytes.len(), |len| name_start + len);
        let eq = skip_xml_whitespace(bytes, name_end);
        if name_end == name_start || bytes.get(eq) != Some(&b'=') {
            break; // end of the attribute list, or malformed markup
        }
        let value_start = skip_xml_whitespace(bytes, eq + 1);
        let quote = match bytes.get(value_start) {
            Some(&b'"') => b'"',
            Some(&b'\'') => b'\'',
            _ => break, // unquoted value: leave the tag as written
        };
        let attr_end = match bytes[value_start + 1..].iter().position(|&b| b == quote) {
            Some(len) => value_start + 1 + len + 1,
            None => break, // unterminated value
        };
        if &tag[name_start..name_end] != "conditionalAd" {
            out.push_str(&tag[attr_start..attr_end]);
        }
        pos = attr_end;
    }
    out.push_str(&tag[pos..]);
}
242
243// ── Tests ─────────────────────────────────────────────────────────────────────
244
#[cfg(test)]
mod tests {
    use super::*;

    // Rule IDs the fixer reports, named once so the tests read as intent.
    const MEDIAFILE_RULE: &str = "VAST-2.0-mediafile-https";
    const TRACKING_RULE: &str = "VAST-2.0-tracking-https";
    const CONDITIONAL_AD_RULE: &str = "VAST-4.0-conditionalad";

    /// A VAST 4.2 document whose Impression and MediaFile URLs both use plain HTTP.
    const HTTP_VAST: &str = r#"<VAST version="4.2">
  <Ad id="1"><InLine>
    <AdSystem>Demo</AdSystem>
    <AdTitle>Test</AdTitle>
    <AdServingId>sid-1</AdServingId>
    <Impression>http://track.example.com/imp</Impression>
    <Creatives>
      <Creative>
        <UniversalAdId idRegistry="ad-id.org">UID-1</UniversalAdId>
        <Linear>
          <Duration>00:00:30</Duration>
          <MediaFiles>
            <MediaFile delivery="progressive" type="video/mp4"
                       width="1920" height="1080">
              http://cdn.example.com/ad.mp4
            </MediaFile>
          </MediaFiles>
        </Linear>
      </Creative>
    </Creatives>
  </InLine></Ad>
</VAST>"#;

    /// True when `result.applied` contains an entry for `rule_id`.
    fn was_applied(result: &FixResult, rule_id: &str) -> bool {
        result.applied.iter().any(|entry| entry.rule_id == rule_id)
    }

    #[test]
    fn upgrades_mediafile_url_to_https() {
        let repaired = fix(HTTP_VAST);
        assert!(repaired.xml.contains("https://cdn.example.com/ad.mp4"));
        assert!(!repaired.xml.contains("http://cdn.example.com/ad.mp4"));
        assert!(was_applied(&repaired, MEDIAFILE_RULE));
    }

    #[test]
    fn upgrades_impression_url_to_https() {
        let repaired = fix(HTTP_VAST);
        assert!(repaired.xml.contains("https://track.example.com/imp"));
        assert!(was_applied(&repaired, TRACKING_RULE));
    }

    #[test]
    fn https_urls_are_not_modified() {
        // Only the MediaFile URL is pre-upgraded; the Impression stays on HTTP.
        let secure_media = HTTP_VAST.replace("http://cdn", "https://cdn");
        let repaired = fix(&secure_media);
        assert!(!was_applied(&repaired, MEDIAFILE_RULE));
        assert!(repaired.xml.contains("https://cdn.example.com/ad.mp4"));
    }

    #[test]
    fn removes_conditional_ad_attribute() {
        let xml = r#"<VAST version="4.1">
  <Ad id="1" conditionalAd="true"><InLine>
    <AdSystem>Demo</AdSystem>
    <AdTitle>Test</AdTitle>
    <AdServingId>sid-1</AdServingId>
    <Impression>https://t.example.com/imp</Impression>
    <Creatives/>
  </InLine></Ad>
</VAST>"#;
        let repaired = fix(xml);
        assert!(!repaired.xml.contains("conditionalAd"));
        assert!(was_applied(&repaired, CONDITIONAL_AD_RULE));
    }

    #[test]
    fn repaired_xml_is_well_formed() {
        // Round-trip: parsing the output should not produce a parse error.
        let repaired = fix(HTTP_VAST);
        let reparsed = crate::parse::parse(&repaired.xml);
        assert!(reparsed.parse_error.is_none(), "{:?}", reparsed.parse_error);
    }

    #[test]
    fn no_applied_fixes_on_clean_document() {
        let already_secure = HTTP_VAST
            .replace("http://cdn", "https://cdn")
            .replace("http://track", "https://track");
        assert!(fix(&already_secure).applied.is_empty());
    }

    #[test]
    fn fix_result_remaining_only_contains_unfixable_issues() {
        // Once the HTTP URLs are upgraded, neither HTTPS rule may still be
        // reported among the remaining issues.
        let repaired = fix(HTTP_VAST);
        let https_rules_cleared = repaired
            .remaining
            .iter()
            .all(|issue| issue.id != MEDIAFILE_RULE && issue.id != TRACKING_RULE);
        assert!(https_rules_cleared);
    }
}