Skip to main content

provable_contracts/
doc_integrity.rs

1//! Document integrity validation for Markdown and SVG files.
2//!
3//! Enforces structural invariants from the `document-integrity-v1` contract:
4//! heading hierarchy, link well-formedness, code fence language tags, GFM
5//! table column parity, SVG structural safety, required sections, and
6//! README drift detection.
7
8use std::path::Path;
9
/// One rule violation reported by a document validator.
///
/// Pairs the location of the problem with the identifier of the rule that was
/// broken and a human-readable explanation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DocViolation {
    /// Line the violation refers to. Validators in this module report 1-based
    /// line numbers; file-level problems (I/O failure, unsupported extension)
    /// use `0`.
    pub line: usize,
    /// Static identifier of the violated rule, e.g. `"heading-hierarchy"`.
    pub rule: &'static str,
    /// Human-readable description of what is wrong.
    pub message: String,
}
17
18/// Result of comparing an actual README against a generated one.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DriftResult {
    /// `true` when at least one line differs between the two documents.
    pub stale: bool,
    /// Number of line positions at which the two documents differ.
    pub diff_lines: usize,
}
24
25/// Validate heading hierarchy: first heading is H1, exactly one H1, no level skips.
26pub fn validate_heading_hierarchy(md: &str) -> Vec<DocViolation> {
27    let mut violations = Vec::new();
28    let mut headings: Vec<(usize, usize)> = Vec::new();
29    let mut in_fence = false;
30
31    for (idx, line) in md.lines().enumerate() {
32        let trimmed = line.trim_start();
33        if trimmed.starts_with("```") {
34            in_fence = !in_fence;
35            continue;
36        }
37        if in_fence {
38            continue;
39        }
40        if !trimmed.starts_with('#') {
41            continue;
42        }
43        let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
44        if hashes > 6 {
45            continue;
46        }
47        let rest = &trimmed[hashes..];
48        if !rest.is_empty() && !rest.starts_with(' ') {
49            continue;
50        }
51        headings.push((idx + 1, hashes));
52    }
53
54    if headings.is_empty() {
55        return violations;
56    }
57
58    if headings[0].1 != 1 {
59        violations.push(DocViolation {
60            line: headings[0].0,
61            rule: "heading-hierarchy",
62            message: format!("first heading must be H1, found H{}", headings[0].1),
63        });
64    }
65
66    for &(line, level) in &headings[1..] {
67        if level == 1 {
68            violations.push(DocViolation {
69                line,
70                rule: "heading-hierarchy",
71                message: "duplicate H1 — exactly one H1 allowed per document".into(),
72            });
73        }
74    }
75
76    for i in 1..headings.len() {
77        let (_, prev) = headings[i - 1];
78        let (line, curr) = headings[i];
79        if curr > prev + 1 {
80            violations.push(DocViolation {
81                line,
82                rule: "heading-hierarchy",
83                message: format!(
84                    "heading level skip: H{curr} follows H{prev} (expected H{} or lower)",
85                    prev + 1
86                ),
87            });
88        }
89    }
90
91    violations
92}
93
94/// Validate `[text](url)` and `![alt](src)` links for empty URLs, `javascript:` scheme, and spaces.
95pub fn validate_links(md: &str) -> Vec<DocViolation> {
96    let mut violations = Vec::new();
97    let mut in_fence = false;
98
99    for (idx, line) in md.lines().enumerate() {
100        let trimmed_check = line.trim_start();
101        if trimmed_check.starts_with("```") {
102            in_fence = !in_fence;
103            continue;
104        }
105        if in_fence {
106            continue;
107        }
108        let line_num = idx + 1;
109        let bytes = line.as_bytes();
110        let len = bytes.len();
111        let mut i = 0;
112
113        while i < len {
114            if i + 1 < len && bytes[i] == b']' && bytes[i + 1] == b'(' && bytes[..i].contains(&b'[')
115            {
116                let url_start = i + 2;
117                let mut depth = 1u32;
118                let mut url_end = url_start;
119                while url_end < len && depth > 0 {
120                    match bytes[url_end] {
121                        b'(' => depth += 1,
122                        b')' => depth -= 1,
123                        _ => {}
124                    }
125                    if depth > 0 {
126                        url_end += 1;
127                    }
128                }
129                let url = &line[url_start..url_end];
130                if url.is_empty() {
131                    violations.push(DocViolation {
132                        line: line_num,
133                        rule: "link-wellformedness",
134                        message: "link URL is empty".into(),
135                    });
136                } else {
137                    if url.starts_with("javascript:") {
138                        violations.push(DocViolation {
139                            line: line_num,
140                            rule: "link-wellformedness",
141                            message: format!("link URL uses javascript: scheme (XSS risk): {url}"),
142                        });
143                    }
144                    if url.contains(' ') {
145                        violations.push(DocViolation {
146                            line: line_num,
147                            rule: "link-wellformedness",
148                            message: format!("link URL contains unescaped space: {url}"),
149                        });
150                    }
151                }
152                i = url_end + 1;
153            } else {
154                i += 1;
155            }
156        }
157    }
158    violations
159}
160
161/// Validate code fences — flags bare triple-backtick fences without a language tag.
162///
163/// Tracks open/close state so only opening fences (not closing fences) are
164/// checked for a language tag.
165pub fn validate_code_fences(md: &str) -> Vec<DocViolation> {
166    let mut violations = Vec::new();
167    let mut in_fence = false;
168
169    for (idx, line) in md.lines().enumerate() {
170        let trimmed = line.trim();
171        if trimmed.starts_with("```") {
172            if in_fence {
173                // Closing fence — no language tag expected.
174                in_fence = false;
175            } else {
176                // Opening fence — check for language tag.
177                in_fence = true;
178                if trimmed[3..].trim().is_empty() {
179                    violations.push(DocViolation {
180                        line: idx + 1,
181                        rule: "code-fence-language",
182                        message: "code fence without language tag".into(),
183                    });
184                }
185            }
186        }
187    }
188    violations
189}
190
191/// Validate GFM tables — checks column count consistency across header, separator, and rows.
192pub fn validate_tables(md: &str) -> Vec<DocViolation> {
193    let mut violations = Vec::new();
194    let lines: Vec<&str> = md.lines().collect();
195    let mut i = 0;
196    let mut in_fence = false;
197
198    while i < lines.len() {
199        let trimmed_check = lines[i].trim_start();
200        if trimmed_check.starts_with("```") {
201            in_fence = !in_fence;
202            i += 1;
203            continue;
204        }
205        if in_fence {
206            i += 1;
207            continue;
208        }
209        let line = lines[i].trim();
210        if !line.starts_with('|') {
211            i += 1;
212            continue;
213        }
214        let header_cols = count_table_columns(line);
215        if i + 1 >= lines.len() {
216            i += 1;
217            continue;
218        }
219        let sep_line = lines[i + 1].trim();
220        if !is_table_separator(sep_line) {
221            i += 1;
222            continue;
223        }
224
225        let sep_cols = count_table_columns(sep_line);
226        if sep_cols != header_cols {
227            violations.push(DocViolation {
228                line: i + 2,
229                rule: "table-column-parity",
230                message: format!("separator has {sep_cols} columns, header has {header_cols}"),
231            });
232        }
233
234        let mut j = i + 2;
235        while j < lines.len() {
236            let row = lines[j].trim();
237            if !row.starts_with('|') {
238                break;
239            }
240            let row_cols = count_table_columns(row);
241            if row_cols != header_cols {
242                violations.push(DocViolation {
243                    line: j + 1,
244                    rule: "table-column-parity",
245                    message: format!("row has {row_cols} columns, header has {header_cols}"),
246                });
247            }
248            j += 1;
249        }
250        i = j;
251    }
252    violations
253}
254
/// Count the cells in one GFM table row.
///
/// One leading and one trailing pipe are treated as the row's outer borders
/// and ignored; the remaining *unescaped* pipes delimit cells. A pipe preceded
/// by a backslash (`\|`) is cell content, as GFM allows, and does not start a
/// new column. A row with no content between its borders has zero columns.
fn count_table_columns(row: &str) -> usize {
    let trimmed = row.trim();
    let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);

    // The final pipe is only a border when it is not escaped, i.e. when it is
    // preceded by an even number of backslashes.
    let inner = match inner.strip_suffix('|') {
        Some(rest) => {
            let trailing_backslashes = rest.bytes().rev().take_while(|&b| b == b'\\').count();
            if trailing_backslashes % 2 == 0 {
                rest
            } else {
                inner
            }
        }
        None => inner,
    };

    if inner.trim().is_empty() {
        return 0;
    }

    let mut columns = 1;
    let mut escaped = false;
    for byte in inner.bytes() {
        if escaped {
            // Whatever follows a backslash is literal content.
            escaped = false;
        } else if byte == b'\\' {
            escaped = true;
        } else if byte == b'|' {
            columns += 1;
        }
    }
    columns
}
264
/// Report whether `line` is a GFM table delimiter row.
///
/// The line must contain at least one pipe, and every cell between the
/// optional outer pipes must consist of one or more hyphens with an optional
/// leading and/or trailing colon (`---`, `:---`, `---:`, `:---:`). Cells made
/// only of colons (`:`) or with a colon in the middle (`-:-`) are rejected,
/// since GFM does not treat such a row as a table delimiter.
fn is_table_separator(line: &str) -> bool {
    let trimmed = line.trim();
    if !trimmed.contains('|') {
        return false;
    }
    let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);
    let inner = inner.strip_suffix('|').unwrap_or(inner);
    inner.split('|').all(|cell| {
        // Peel the optional alignment colons; what remains must be hyphens.
        let cell = cell.trim();
        let cell = cell.strip_prefix(':').unwrap_or(cell);
        let dashes = cell.strip_suffix(':').unwrap_or(cell);
        !dashes.is_empty() && dashes.bytes().all(|b| b == b'-')
    })
}
277
278/// Validate SVG structural safety (string-based, no XML parser).
279pub fn validate_svg(content: &str) -> Vec<DocViolation> {
280    let mut violations = Vec::new();
281    let lower = content.to_ascii_lowercase();
282
283    if !lower.contains("<svg") {
284        violations.push(DocViolation {
285            line: 1,
286            rule: "svg-structural-safety",
287            message: "missing <svg> root element".into(),
288        });
289        return violations;
290    }
291    if !content.contains("viewBox") {
292        violations.push(DocViolation {
293            line: 1,
294            rule: "svg-structural-safety",
295            message: "missing viewBox attribute on <svg>".into(),
296        });
297    }
298    for (tag, msg) in [
299        ("<script", "SVG contains <script> tag (XSS risk)"),
300        ("<foreignobject", "SVG contains <foreignObject> tag"),
301    ] {
302        if lower.contains(tag) {
303            for (idx, line) in content.lines().enumerate() {
304                if line.to_ascii_lowercase().contains(tag) {
305                    violations.push(DocViolation {
306                        line: idx + 1,
307                        rule: "svg-structural-safety",
308                        message: msg.into(),
309                    });
310                }
311            }
312        }
313    }
314    let has_xmlns = content.contains("xmlns=\"http://www.w3.org/2000/svg\"")
315        || content.contains("xmlns='http://www.w3.org/2000/svg'");
316    if !has_xmlns {
317        violations.push(DocViolation {
318            line: 1,
319            rule: "svg-structural-safety",
320            message: "missing xmlns=\"http://www.w3.org/2000/svg\" namespace".into(),
321        });
322    }
323    violations
324}
325
/// Check that each required section name appears as a heading.
///
/// Returns the entries of `required` that have no matching heading, in their
/// original order. Titles are compared ASCII-case-insensitively.
///
/// Only real ATX headings are considered: one to six `#` characters followed
/// by a space or tab. Lines such as `#Usage` or `#123` are ordinary text in
/// Markdown and therefore do not satisfy a requirement. An optional closing
/// run of `#` (`## License ##`) is not part of the title. Lines between
/// triple-backtick fences are ignored.
pub fn validate_required_sections(md: &str, required: &[&str]) -> Vec<String> {
    let mut in_fence = false;
    let mut titles: Vec<&str> = Vec::new();

    for line in md.lines() {
        let trimmed = line.trim_start();
        if trimmed.starts_with("```") {
            in_fence = !in_fence;
            continue;
        }
        if in_fence {
            continue;
        }

        let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
        if hashes == 0 || hashes > 6 {
            continue;
        }
        let rest = &trimmed[hashes..];
        // A heading with no text after the marker cannot name a section, so
        // requiring the separator here also covers the empty-heading case.
        if !rest.starts_with(|c: char| c == ' ' || c == '\t') {
            continue;
        }

        let mut title = rest.trim();
        // Drop a closing `#` run, but only when it is set off by whitespace
        // so that titles like "C#" keep their trailing hash.
        let without_closing = title.trim_end_matches('#');
        if without_closing.len() < title.len()
            && (without_closing.is_empty() || without_closing.ends_with(char::is_whitespace))
        {
            title = without_closing.trim_end();
        }

        if !title.is_empty() {
            titles.push(title);
        }
    }

    required
        .iter()
        .filter(|&&name| !titles.iter().any(|title| title.eq_ignore_ascii_case(name)))
        .map(|name| (*name).to_string())
        .collect()
}
356
357/// Detect drift between actual and generated README content.
358pub fn detect_readme_drift(actual: &str, generated: &str) -> DriftResult {
359    let norm = |s: &str| -> Vec<String> { s.lines().map(|l| l.trim_end().to_string()).collect() };
360    let a = norm(actual);
361    let g = norm(generated);
362    let max_len = a.len().max(g.len());
363    let mut diff_count = 0usize;
364    for i in 0..max_len {
365        if a.get(i).map_or("", String::as_str) != g.get(i).map_or("", String::as_str) {
366            diff_count += 1;
367        }
368    }
369    DriftResult {
370        stale: diff_count > 0,
371        diff_lines: diff_count,
372    }
373}
374
375/// Dispatch to md or svg validator based on file extension.
376pub fn validate_document(path: &Path) -> Vec<DocViolation> {
377    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
378    let read = |p: &Path| -> Result<String, Vec<DocViolation>> {
379        std::fs::read_to_string(p).map_err(|e| {
380            vec![DocViolation {
381                line: 0,
382                rule: "io-error",
383                message: format!("failed to read file: {e}"),
384            }]
385        })
386    };
387    match ext {
388        "md" | "markdown" => {
389            let content = match read(path) {
390                Ok(c) => c,
391                Err(v) => return v,
392            };
393            let mut v = validate_heading_hierarchy(&content);
394            v.extend(validate_links(&content));
395            v.extend(validate_code_fences(&content));
396            v.extend(validate_tables(&content));
397            v
398        }
399        "svg" => match read(path) {
400            Ok(c) => validate_svg(&c),
401            Err(v) => v,
402        },
403        _ => vec![DocViolation {
404            line: 0,
405            rule: "unsupported-extension",
406            message: format!("unsupported file extension: .{ext}"),
407        }],
408    }
409}
410
// The unit tests for this module are kept in the separate file named by the
// `#[path]` attribute and are compiled only for test builds.
#[cfg(test)]
#[path = "doc_integrity_tests.rs"]
mod tests;