webspec-index 0.9.0

Query WHATWG/W3C/TC39 web specifications from the command line
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
use crate::db::{queries, write};
use crate::model::ParsedSpec;
use crate::parse;
use anyhow::{Context, Result};
use rusqlite::Connection;

/// Parsed PR preview metadata extracted from a GitHub PR body.
///
/// Produced by `parse_pr_body`, which only returns a value when at least one
/// page was found and both SHAs are non-empty.
#[derive(Debug, Clone)]
pub struct PrPreview {
    /// Pull request number, exactly as passed to `parse_pr_body`.
    pub pr_number: i64,
    /// Head commit SHA as written in the PR body: the text inside the last
    /// parentheses of a preview link's `title` attribute (e.g. `7ceff82`).
    pub head_sha: String,
    /// Merge base SHA as written in the PR body: the text before `...` in a
    /// diff link's path segment (e.g. `74cbe0a`).
    pub merge_base_sha: String,
    /// Preview pages in the order their lines appear in the PR body.
    pub pages: Vec<PrPage>,
}

/// A single preview page from whatpr.org.
#[derive(Debug, Clone)]
pub struct PrPage {
    /// Last `/`-separated segment of `url` (e.g. `input.html`).
    pub page_path: String,
    /// Preview page link: the first `href` on the line, which must mention
    /// `whatpr.org` and must not contain `...`.
    pub url: String,
    /// Diff link found on the same line, if any: the first `href` that contains
    /// both `...` and `whatpr.org`.
    pub diff_url: Option<String>,
}

/// Parse the preview block from a WHATWG PR body.
///
/// Expects the structured block below the `---` separator, containing
/// `<a href="https://whatpr.org/...">` links with commit SHAs in title attrs.
pub fn parse_pr_body(pr_number: i64, body: &str) -> Result<PrPreview> {
    let preview_block = body
        .split("Don't remove this comment or modify anything below this line.")
        .nth(1)
        .context("PR body has no preview block")?;

    let mut pages = Vec::new();
    let mut head_sha = String::new();
    let mut merge_base_sha = String::new();

    for line in preview_block.lines() {
        if let Some(url) = extract_href(line) {
            if !url.contains("whatpr.org") {
                continue;
            }
            // Skip diff links (they contain "..." in the path)
            if url.contains("...") {
                continue;
            }
            let page_path = url.rsplit('/').next().unwrap_or("").to_string();

            if head_sha.is_empty() {
                if let Some(sha) = extract_sha_from_title(line) {
                    head_sha = sha;
                }
            }

            let diff_url = extract_diff_url(line);
            if merge_base_sha.is_empty() {
                if let Some(ref du) = diff_url {
                    if let Some(base) = extract_merge_base_from_diff_url(du) {
                        merge_base_sha = base;
                    }
                }
            }

            pages.push(PrPage {
                page_path,
                url,
                diff_url,
            });
        }
    }

    if pages.is_empty() {
        anyhow::bail!("No preview pages found in PR body");
    }
    if head_sha.is_empty() {
        anyhow::bail!("Could not extract head SHA from PR body");
    }
    if merge_base_sha.is_empty() {
        anyhow::bail!("Could not extract merge base SHA from PR body");
    }

    Ok(PrPreview {
        pr_number,
        head_sha,
        merge_base_sha,
        pages,
    })
}

/// Return the value of the first `href="..."` attribute on `line`.
///
/// Yields `None` when the line has no `href="` or when the attribute value is
/// never closed by a double quote.
fn extract_href(line: &str) -> Option<String> {
    let (_, after_attr) = line.split_once("href=\"")?;
    let (value, _) = after_attr.split_once('"')?;
    Some(value.to_owned())
}

/// Pull the parenthesised value out of the first `title="..."` attribute.
///
/// Example title: `Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)`,
/// which yields `7ceff82`. The last `(` and the last `)` of the title are
/// used; `None` is returned when either is missing or they are out of order.
fn extract_sha_from_title(line: &str) -> Option<String> {
    let (_, after_attr) = line.split_once("title=\"")?;
    let (title, _) = after_attr.split_once('"')?;

    let open = title.rfind('(')?;
    let close = title.rfind(')')?;
    if open >= close {
        return None;
    }
    Some(title[open + 1..close].to_string())
}

/// Find the diff link on `line`.
///
/// Walks the `href="..."` attributes from left to right and returns the first
/// value that contains both `...` and `whatpr.org`. Returns `None` when no
/// attribute qualifies or when an attribute value is never closed by a quote.
fn extract_diff_url(line: &str) -> Option<String> {
    const ATTR: &str = "href=\"";

    let mut remaining = line;
    while let Some(attr_at) = remaining.find(ATTR) {
        let value_and_tail = &remaining[attr_at + ATTR.len()..];
        let close = value_and_tail.find('"')?;
        let candidate = &value_and_tail[..close];
        if candidate.contains("...") && candidate.contains("whatpr.org") {
            return Some(candidate.to_string());
        }
        // Resume scanning at the closing quote of the attribute just rejected.
        remaining = &value_and_tail[close..];
    }
    None
}

/// Extract the merge base SHA from a diff URL.
///
/// URL format: `https://whatpr.org/html/11741/74cbe0a...7ceff82/page.html`.
/// The first `/`-separated segment containing `...` is taken, and the text
/// before the `...` is the merge base (`74cbe0a` in the example).
///
/// Returns `None` when no segment contains `...`, or when that segment has
/// nothing before the `...` — an empty SHA is never reported as a success.
fn extract_merge_base_from_diff_url(url: &str) -> Option<String> {
    url.split('/')
        .find_map(|segment| segment.split_once("..."))
        .map(|(base, _head)| base)
        .filter(|base| !base.is_empty())
        .map(str::to_owned)
}

/// Combine the per-page results of a multi-page fetch into a single spec.
///
/// Sections are de-duplicated by anchor: the first section seen with a given
/// anchor wins and later ones are dropped. References and IDL definitions are
/// concatenated in input order without de-duplication.
pub fn merge_parsed_specs(specs: Vec<ParsedSpec>) -> ParsedSpec {
    let mut known_anchors = std::collections::HashSet::new();
    let mut merged = ParsedSpec {
        sections: Vec::new(),
        references: Vec::new(),
        idl_definitions: Vec::new(),
    };

    for ParsedSpec {
        sections,
        references,
        idl_definitions,
    } in specs
    {
        // `insert` returns false for an anchor that was already recorded.
        merged.sections.extend(
            sections
                .into_iter()
                .filter(|section| known_anchors.insert(section.anchor.clone())),
        );
        merged.references.extend(references);
        merged.idl_definitions.extend(idl_definitions);
    }

    merged
}

/// Expand an abbreviated commit SHA into the full SHA using the GitHub API.
///
/// Requests `https://api.github.com/repos/{repo}/commits/{short_sha}` and
/// returns the string stored under the `sha` key of the JSON response.
///
/// # Errors
///
/// Fails when the request cannot be sent, when GitHub answers with an error
/// status, when the body is not valid JSON, or when `sha` is missing or not a
/// string.
pub async fn resolve_full_sha(repo: &str, short_sha: &str) -> Result<String> {
    const USER_AGENT: &str = concat!("webspec-index/", env!("CARGO_PKG_VERSION"));

    let endpoint = format!("https://api.github.com/repos/{repo}/commits/{short_sha}");
    let request = reqwest::Client::new()
        .get(&endpoint)
        .header("User-Agent", USER_AGENT)
        .header("Accept", "application/vnd.github+json");

    let response = request.send().await?.error_for_status()?;
    let payload: serde_json::Value = response.json().await?;

    payload["sha"]
        .as_str()
        .map(str::to_owned)
        .context("GitHub API response missing sha field")
}

/// Fetch the PR body from the GitHub API and parse its preview metadata.
///
/// Requests `https://api.github.com/repos/{repo}/pulls/{pr_number}`, reads the
/// `body` string of the JSON response and hands it to [`parse_pr_body`].
///
/// # Errors
///
/// Fails when the request cannot be sent, when GitHub answers with an error
/// status (reported as "Failed to fetch PR #N from repo"), when the response
/// is not valid JSON, when `body` is missing or not a string, or when the body
/// cannot be parsed as a preview block.
pub async fn fetch_pr_preview(repo: &str, pr_number: i64) -> Result<PrPreview> {
    let url = format!("https://api.github.com/repos/{repo}/pulls/{pr_number}");
    let client = reqwest::Client::new();
    let resp = client
        .get(&url)
        .header(
            "User-Agent",
            concat!("webspec-index/", env!("CARGO_PKG_VERSION")),
        )
        .header("Accept", "application/vnd.github+json")
        .send()
        .await?
        .error_for_status()
        // Build the message lazily so the success path does not allocate it.
        .with_context(|| format!("Failed to fetch PR #{pr_number} from {repo}"))?;
    let json: serde_json::Value = resp.json().await?;
    let body = json["body"].as_str().context("PR has no body")?;
    parse_pr_body(pr_number, body)
}

/// Fetch all preview pages from whatpr.org for a PR and parse them.
async fn fetch_pr_pages(
    preview: &PrPreview,
    spec_name: &str,
    base_url: &str,
) -> Result<ParsedSpec> {
    let mut parsed_pages = Vec::new();
    for page in &preview.pages {
        eprintln!(
            "Fetching PR #{} page: {}",
            preview.pr_number, page.page_path
        );
        let html = super::fetch_raw_html(&page.url).await?;
        let parsed = parse::parse_spec(&html, spec_name, base_url)?;
        parsed_pages.push(parsed);
    }
    Ok(merge_parsed_specs(parsed_pages))
}

/// Fetch and parse the merge base spec from a commit snapshot.
///
/// The snapshot URL is `https://{host}/commit-snapshots/{full_sha}/`, where
/// `host` is `base_url` with any leading `https://` and trailing `/` removed.
/// A progress line showing at most the first 80 characters of the URL is
/// written to stderr.
///
/// # Errors
///
/// Fails when the page cannot be fetched or parsed.
async fn fetch_merge_base(spec_name: &str, base_url: &str, full_sha: &str) -> Result<ParsedSpec> {
    let host = base_url
        .trim_start_matches("https://")
        .trim_end_matches('/');
    let url = format!("https://{host}/commit-snapshots/{full_sha}/");
    // Truncate by characters rather than bytes: a byte-index slice panics when
    // the cut lands inside a multi-byte character.
    let shown: String = url.chars().take(80).collect();
    eprintln!("Fetching merge base {}: {}", spec_name, shown);
    let html = super::fetch_raw_html(&url).await?;
    parse::parse_spec(&html, spec_name, base_url)
}

/// Report whether a snapshot has at least one row in `sections`.
///
/// A snapshot without sections is treated as invalid. Any database error while
/// counting is also reported as `false` rather than propagated.
fn is_pr_snapshot_valid(conn: &Connection, snapshot_id: i64) -> bool {
    let section_count = conn.query_row(
        "SELECT COUNT(*) FROM sections WHERE snapshot_id = ?1",
        [snapshot_id],
        |row| row.get::<_, i64>(0),
    );
    matches!(section_count, Ok(n) if n > 0)
}

/// Ensure a PR snapshot is indexed and fresh.
///
/// Returns `(pr_snapshot_id, merge_base_snapshot_id)`.
///
/// The lookup proceeds in three stages:
///
/// 1. **Local fast path** (only when `force` is false): if a stored PR snapshot
///    has at least one section, its `indexed_at` parses as RFC 3339 and
///    `super::is_fresh` accepts it (a 24h window according to the inline
///    comment), and its merge base snapshot is stored too, the cached IDs are
///    returned without any network request.
/// 2. **Head SHA check**: otherwise the PR body is fetched from GitHub. If the
///    stored snapshot's `sha` ends with the current head SHA, the snapshot has
///    sections, and its merge base snapshot exists, the cached IDs are
///    returned. This also applies when `force` is true. In every other case
///    where a stored PR snapshot exists, its data is deleted.
/// 3. **Re-index**: the merge base SHA is resolved to a full SHA, the merge
///    base snapshot is reused or fetched and stored, and the PR pages are
///    fetched, parsed and stored as a new PR snapshot.
///
/// # Errors
///
/// Propagates database errors as well as GitHub / page fetch and parse
/// failures from the helpers it calls.
pub async fn ensure_pr_indexed(
    conn: &Connection,
    spec_name: &str,
    base_url: &str,
    provider: &str,
    pr_number: i64,
    force: bool,
) -> Result<(i64, i64)> {
    let spec_id = write::insert_or_get_spec(conn, spec_name, base_url, provider)?;

    // Fast path: if not forcing, check 24h freshness before hitting the GitHub API.
    // Every condition below must hold; failing any of them falls through to the
    // network path instead of returning an error (except a failing query, which
    // is propagated by `?`).
    if !force {
        if let Some((pr_snap_id, stored_base_sha)) =
            queries::get_pr_snapshot(conn, spec_name, pr_number)?
        {
            if is_pr_snapshot_valid(conn, pr_snap_id) {
                let indexed_at: String = conn.query_row(
                    "SELECT indexed_at FROM snapshots WHERE id = ?1",
                    [pr_snap_id],
                    |row| row.get(0),
                )?;
                // An unparseable timestamp is treated as "not fresh".
                if let Ok(indexed) = chrono::DateTime::parse_from_rfc3339(&indexed_at) {
                    let indexed_utc = indexed.with_timezone(&chrono::Utc);
                    if super::is_fresh(&indexed_utc, &chrono::Utc::now()) {
                        if let Some(base_snap_id) =
                            queries::get_commit_snapshot(conn, spec_id, &stored_base_sha)?
                        {
                            return Ok((pr_snap_id, base_snap_id));
                        }
                    }
                }
            }
        }
    }

    // Determine the WHATWG repo name from spec name
    // NOTE(review): the `whatwg/` organisation is hard-coded here; `provider`
    // is only used when registering the spec above.
    let repo = format!("whatwg/{}", spec_name.to_lowercase());

    // Fetch PR preview metadata from GitHub
    let preview = fetch_pr_preview(&repo, pr_number).await?;

    // Check if we already have this PR indexed with the same head SHA
    if let Some((pr_snap_id, stored_base_sha)) =
        queries::get_pr_snapshot(conn, spec_name, pr_number)?
    {
        let pr_sha: String = conn.query_row(
            "SELECT sha FROM snapshots WHERE id = ?1",
            [pr_snap_id],
            |row| row.get(0),
        )?;
        // PR snapshots are stored below as "pr:{number}:{head_sha}", hence the
        // suffix comparison against the head SHA from the PR body.
        if pr_sha.ends_with(&preview.head_sha) && is_pr_snapshot_valid(conn, pr_snap_id) {
            // Head SHA unchanged — find the merge base snapshot
            if let Some(base_snap_id) =
                queries::get_commit_snapshot(conn, spec_id, &stored_base_sha)?
            {
                return Ok((pr_snap_id, base_snap_id));
            }
        }
        // Stale, empty, or missing its merge base — delete old PR data
        write::delete_pr_data(conn, spec_id, pr_number)?;
    }

    // Resolve short merge base SHA to full SHA
    let full_base_sha = resolve_full_sha(&repo, &preview.merge_base_sha).await?;

    // Fetch or reuse merge base snapshot
    let base_snap_id =
        if let Some(id) = queries::get_commit_snapshot(conn, spec_id, &full_base_sha)? {
            id
        } else {
            let base_parsed = fetch_merge_base(spec_name, base_url, &full_base_sha).await?;
            // NOTE: this records the time of indexing, not the commit's own date.
            let commit_date = chrono::Utc::now().to_rfc3339();
            let id = write::insert_snapshot(conn, spec_id, &full_base_sha, &commit_date)?;
            write::insert_sections_bulk(conn, id, &base_parsed.sections)?;
            write::insert_refs_bulk(conn, id, &base_parsed.references)?;
            write::insert_idl_defs_bulk(conn, id, &base_parsed.idl_definitions)?;
            id
        };

    // Fetch and parse PR pages
    let pr_parsed = fetch_pr_pages(&preview, spec_name, base_url).await?;
    // Synthetic identifier for the PR snapshot; see the suffix check above.
    let pr_sha = format!("pr:{}:{}", pr_number, preview.head_sha);
    // As for the merge base, this is the time of indexing.
    let commit_date = chrono::Utc::now().to_rfc3339();
    let page_paths: Vec<String> = preview.pages.iter().map(|p| p.page_path.clone()).collect();
    let pr_snap_id = write::insert_pr_snapshot(
        conn,
        spec_id,
        &pr_sha,
        &commit_date,
        pr_number,
        &full_base_sha,
        &page_paths,
    )?;
    // If one of the inserts below fails, the snapshot row is left without
    // sections; `is_pr_snapshot_valid` makes later calls ignore such a row.
    write::insert_sections_bulk(conn, pr_snap_id, &pr_parsed.sections)?;
    write::insert_refs_bulk(conn, pr_snap_id, &pr_parsed.references)?;
    write::insert_idl_defs_bulk(conn, pr_snap_id, &pr_parsed.idl_definitions)?;

    Ok((pr_snap_id, base_snap_id))
}

// Unit tests for PR body parsing, snapshot validity and spec merging.
#[cfg(test)]
mod tests {
    use super::*;

    /// PR body fixture: free-form description, the generated HTML comment with
    /// the marker sentence, then four preview lines that each carry a page
    /// link and a diff link (head `7ceff82`, merge base `74cbe0a`).
    const SAMPLE_PR_BODY: &str = r#"Some PR description text here.

<!--
    This comment and the below content is programmatically generated.
    You may add a comma-separated list of anchors you'd like a
    direct link to below (e.g. #idl-serializers, #idl-sequence):

    Don't remove this comment or modify anything below this line.
    If you don't want a preview generated for this pull request,
    just replace the whole of this comment's content by "no preview"
    and remove what's below.
-->
***
<a href="https://whatpr.org/html/11741/form-control-infrastructure.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">/form-control-infrastructure.html</a>  ( <a href="https://whatpr.org/html/11741/74cbe0a...7ceff82/form-control-infrastructure.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">diff</a> )
<a href="https://whatpr.org/html/11741/form-elements.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">/form-elements.html</a>  ( <a href="https://whatpr.org/html/11741/74cbe0a...7ceff82/form-elements.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">diff</a> )
<a href="https://whatpr.org/html/11741/infrastructure.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">/infrastructure.html</a>  ( <a href="https://whatpr.org/html/11741/74cbe0a...7ceff82/infrastructure.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">diff</a> )
<a href="https://whatpr.org/html/11741/input.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">/input.html</a>  ( <a href="https://whatpr.org/html/11741/74cbe0a...7ceff82/input.html" title="Last updated on Apr 28, 2026, 4:05 PM UTC (7ceff82)">diff</a> )"#;

    /// The fixture yields four pages plus the head and merge base SHAs.
    #[test]
    fn test_parse_pr_body_extracts_pages() {
        let preview = parse_pr_body(11741, SAMPLE_PR_BODY).unwrap();
        assert_eq!(preview.pr_number, 11741);
        assert_eq!(preview.head_sha, "7ceff82");
        assert_eq!(preview.merge_base_sha, "74cbe0a");
        assert_eq!(preview.pages.len(), 4);
        assert_eq!(
            preview.pages[0].page_path,
            "form-control-infrastructure.html"
        );
        assert_eq!(
            preview.pages[0].url,
            "https://whatpr.org/html/11741/form-control-infrastructure.html"
        );
        assert!(preview.pages[0].diff_url.is_some());
    }

    /// A body without the marker sentence is rejected.
    #[test]
    fn test_parse_pr_body_no_preview_block() {
        let result = parse_pr_body(1, "Just a regular PR body without preview");
        assert!(result.is_err());
    }

    /// The merge base is the text before `...` in the diff URL.
    #[test]
    fn test_extract_merge_base_from_diff_url() {
        let url = "https://whatpr.org/html/11741/74cbe0a...7ceff82/form-elements.html";
        assert_eq!(
            extract_merge_base_from_diff_url(url),
            Some("74cbe0a".to_string())
        );
    }

    /// A PR snapshot row with no sections must not count as valid.
    #[test]
    fn test_empty_pr_snapshot_not_treated_as_cached() {
        use crate::db;
        use crate::db::write;

        let conn = db::open_test_db().unwrap();
        let spec_id =
            write::insert_or_get_spec(&conn, "HTML", "https://html.spec.whatwg.org", "whatwg")
                .unwrap();

        // Insert the PR snapshot and its merge base without any sections.
        write::insert_pr_snapshot(
            &conn,
            spec_id,
            "pr:99:deadbeef",
            "2026-01-01T00:00:00Z",
            99,
            "basesha",
            &[],
        )
        .unwrap();
        write::insert_snapshot(&conn, spec_id, "basesha", "2026-01-01T00:00:00Z").unwrap();

        let pr_snap_id: i64 = conn
            .query_row(
                "SELECT id FROM snapshots WHERE sha = 'pr:99:deadbeef'",
                [],
                |row| row.get(0),
            )
            .unwrap();
        assert!(!is_pr_snapshot_valid(&conn, pr_snap_id));
    }

    /// Merging two specs with distinct anchors keeps both sections and the
    /// single reference.
    #[test]
    fn test_merge_parsed_specs() {
        use crate::model::{ParsedReference, ParsedSection, ParsedSpec, SectionType};

        let spec1 = ParsedSpec {
            sections: vec![ParsedSection {
                anchor: "sec-a".into(),
                title: Some("A".into()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: Some(2),
            }],
            references: vec![],
            idl_definitions: vec![],
        };
        let spec2 = ParsedSpec {
            sections: vec![ParsedSection {
                anchor: "sec-b".into(),
                title: Some("B".into()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: Some(2),
            }],
            references: vec![ParsedReference {
                from_anchor: "sec-b".into(),
                to_spec: "DOM".into(),
                to_anchor: "concept-tree".into(),
            }],
            idl_definitions: vec![],
        };

        let merged = merge_parsed_specs(vec![spec1, spec2]);
        assert_eq!(merged.sections.len(), 2);
        assert_eq!(merged.references.len(), 1);
    }
}