Skip to main content

webspec_index/
spec_list.rs

1use anyhow::{Context, Result};
2use rusqlite::Connection;
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::path::Path;
6use std::process::Command;
7
8const CSSWG_URL: &str = "https://github.com/w3c/csswg-drafts";
9const GROUPS_URL: &str = "https://github.com/w3c/groups";
10const BUNDLED_SPEC_LIST: &str = include_str!("../data/w3c_specs.json");
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct SpecEntry {
14    pub name: String,
15    pub base_url: String,
16    pub provider: String,
17    pub github_repo: String,
18}
19
20/// Seed the DB from the bundled W3C spec list and known non-W3C specs.
21pub fn fetch_and_seed(conn: &Connection) -> Result<usize> {
22    let entries: Vec<SpecEntry> = serde_json::from_str(BUNDLED_SPEC_LIST)
23        .context("Failed to parse bundled w3c_specs.json")?;
24    let mut count = entries.len();
25    for e in &entries {
26        crate::db::write::seed_spec(conn, &e.name, &e.base_url, &e.provider)?;
27    }
28
29    let known = crate::spec_registry::known_specs();
30    count += known.len();
31    for (name, base_url, provider) in &known {
32        crate::db::write::seed_spec(conn, name, base_url, provider)?;
33    }
34
35    Ok(count)
36}
37
38/// Update the W3C spec list from csswg-drafts and w3c/groups.
39///
40/// This covers W3C specs only. WHATWG specs (HTML, DOM, Fetch, …) and TC39
41/// specs (ECMAScript, …) are small, stable lists hardcoded in their respective
42/// providers (`src/provider/whatwg.rs`, `src/provider/tc39.rs`).
43pub fn update(
44    csswg_dir: &Path,
45    groups_dir: &Path,
46    output: &Path,
47) -> Result<(usize, usize, Vec<SpecEntry>)> {
48    clone_or_update(CSSWG_URL, csswg_dir)?;
49    clone_or_update(GROUPS_URL, groups_dir)?;
50
51    let csswg = collect_csswg(csswg_dir);
52    let standalone = collect_standalone(groups_dir)?;
53    let csswg_count = csswg.len();
54    let standalone_count = standalone.len();
55
56    let mut all = csswg;
57    all.extend(standalone);
58    resolve_collisions(&mut all);
59
60    let mut seen_names = std::collections::HashSet::new();
61    let mut seen_urls = std::collections::HashSet::new();
62    all.retain(|e| seen_names.insert(e.name.clone()) && seen_urls.insert(e.base_url.clone()));
63    all.sort_by(|a, b| a.name.cmp(&b.name));
64
65    if let Some(parent) = output.parent() {
66        std::fs::create_dir_all(parent)?;
67    }
68    let json = serde_json::to_string_pretty(&all)?;
69    std::fs::write(output, format!("{}\n", json))
70        .with_context(|| format!("Failed to write {}", output.display()))?;
71
72    Ok((csswg_count, standalone_count, all))
73}
74
75fn clone_or_update(url: &str, local_path: &Path) -> Result<()> {
76    if local_path.join(".git").is_dir() {
77        eprintln!("Updating {} ...", local_path.display());
78        let status = Command::new("git")
79            .args(["-C", local_path.to_str().unwrap(), "pull", "--depth=1"])
80            .status()
81            .with_context(|| format!("Failed to run git pull in {}", local_path.display()))?;
82        if !status.success() {
83            anyhow::bail!("git pull failed in {}", local_path.display());
84        }
85    } else {
86        eprintln!("Cloning {} into {} ...", url, local_path.display());
87        let status = Command::new("git")
88            .args(["clone", "--depth=1", url, local_path.to_str().unwrap()])
89            .status()
90            .with_context(|| format!("Failed to clone {}", url))?;
91        if !status.success() {
92            anyhow::bail!("git clone failed for {}", url);
93        }
94    }
95    Ok(())
96}
97
98fn collect_csswg(csswg_dir: &Path) -> Vec<SpecEntry> {
99    let mut entries = Vec::new();
100    let skip = ["bin", "css-module"];
101    let read_dir = match std::fs::read_dir(csswg_dir) {
102        Ok(d) => d,
103        Err(e) => {
104            eprintln!("warning: cannot read {}: {}", csswg_dir.display(), e);
105            return entries;
106        }
107    };
108    let mut dirs: Vec<_> = read_dir.flatten().filter(|e| e.path().is_dir()).collect();
109    dirs.sort_by_key(|e| e.file_name());
110
111    for entry in dirs {
112        let dir_name = entry.file_name();
113        let dir_name = dir_name.to_string_lossy();
114        if dir_name.starts_with('.') || skip.contains(&dir_name.as_ref()) {
115            continue;
116        }
117        let has_bs = std::fs::read_dir(entry.path())
118            .ok()
119            .map(|rd| {
120                rd.flatten()
121                    .any(|f| f.file_name().to_string_lossy().ends_with(".bs"))
122            })
123            .unwrap_or(false);
124        if !has_bs {
125            continue;
126        }
127        entries.push(SpecEntry {
128            name: dir_name.to_uppercase(),
129            base_url: format!("https://drafts.csswg.org/{}", dir_name),
130            provider: "w3c".to_string(),
131            github_repo: "w3c/csswg-drafts".to_string(),
132        });
133    }
134    entries
135}
136
137fn collect_standalone(groups_dir: &Path) -> Result<Vec<SpecEntry>> {
138    let repos_path = groups_dir.join("repositories.json");
139    let data = std::fs::read_to_string(&repos_path)
140        .with_context(|| format!("Failed to read {}", repos_path.display()))?;
141    let repos: Vec<serde_json::Value> =
142        serde_json::from_str(&data).context("Failed to parse repositories.json")?;
143
144    let mut entries = Vec::new();
145    for r in &repos {
146        if r.get("isArchived")
147            .and_then(|v| v.as_bool())
148            .unwrap_or(false)
149        {
150            continue;
151        }
152        if r.get("isPrivate")
153            .and_then(|v| v.as_bool())
154            .unwrap_or(false)
155        {
156            continue;
157        }
158        let types: Vec<&str> = r
159            .get("w3cjson")
160            .and_then(|v| v.get("repo-type"))
161            .and_then(|v| v.as_array())
162            .map(|a| a.iter().filter_map(|v| v.as_str()).collect())
163            .unwrap_or_default();
164        if !types.contains(&"rec-track") && !types.contains(&"cg-report") {
165            continue;
166        }
167        let owner = r
168            .get("owner")
169            .and_then(|v| v.get("login"))
170            .and_then(|v| v.as_str())
171            .unwrap_or("");
172        let repo_name = r.get("name").and_then(|v| v.as_str()).unwrap_or("");
173        if owner.is_empty() || repo_name.is_empty() {
174            continue;
175        }
176        if owner == "w3c" && repo_name == "csswg-drafts" {
177            continue;
178        }
179        if owner == "WebAssembly" {
180            continue;
181        }
182
183        let hp_raw = r
184            .get("homepageUrl")
185            .and_then(|v| v.as_str())
186            .unwrap_or("")
187            .trim_end_matches('/')
188            .replace("http://", "https://");
189        let hp = if !hp_raw.is_empty() && !hp_raw.starts_with("https://") {
190            format!("https://{}", hp_raw)
191        } else {
192            hp_raw
193        };
194
195        let base_url = if hp.contains(".github.io") && !hp.ends_with(".github.io") {
196            hp
197        } else if owner == "w3c" && (hp.starts_with("https://www.w3.org/TR/") || hp.is_empty()) {
198            format!("https://w3c.github.io/{}", repo_name)
199        } else {
200            continue;
201        };
202
203        entries.push(SpecEntry {
204            name: repo_name.to_uppercase(),
205            base_url,
206            provider: "w3c".to_string(),
207            github_repo: format!("{}/{}", owner, repo_name),
208        });
209    }
210    Ok(entries)
211}
212
213fn resolve_collisions(entries: &mut [SpecEntry]) {
214    let mut counts: HashMap<String, usize> = HashMap::new();
215    for e in entries.iter() {
216        *counts.entry(e.name.clone()).or_insert(0) += 1;
217    }
218    for e in entries.iter_mut() {
219        if counts[&e.name] > 1 {
220            let org = e.github_repo.split('/').next().unwrap_or("").to_uppercase();
221            e.name = format!("{}-{}", e.name, org);
222        }
223    }
224    let mut counts: HashMap<String, usize> = HashMap::new();
225    for e in entries.iter() {
226        *counts.entry(e.name.clone()).or_insert(0) += 1;
227    }
228    for e in entries.iter_mut() {
229        if counts[&e.name] > 1 {
230            e.name = e.github_repo.replace('/', "-").to_uppercase();
231        }
232    }
233}
234
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal, non-archived, public `repositories.json` record.
    fn make_repo(
        owner: &str,
        name: &str,
        homepage: &str,
        repo_types: &[&str],
    ) -> serde_json::Value {
        serde_json::json!({
            "name": name,
            "owner": {"login": owner},
            "homepageUrl": homepage,
            "isArchived": false,
            "isPrivate": false,
            "w3cjson": {
                "repo-type": repo_types
            }
        })
    }

    /// Write `repos` as `repositories.json` into a fresh temporary directory
    /// and return what `collect_standalone` extracts from it.
    fn collect_from(repos: serde_json::Value) -> Vec<SpecEntry> {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("repositories.json"), repos.to_string()).unwrap();
        collect_standalone(dir.path()).unwrap()
    }

    /// Parse the spec list embedded in the crate.
    ///
    /// Using the embedded constant instead of reading `data/w3c_specs.json`
    /// from disk means the checks cannot be skipped silently when the tests
    /// run from an unexpected working directory.
    fn bundled_specs() -> Vec<SpecEntry> {
        serde_json::from_str(BUNDLED_SPEC_LIST).expect("bundled w3c_specs.json must parse")
    }

    #[test]
    fn test_collect_standalone_github_io_url() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "webcodecs",
            "https://w3c.github.io/webcodecs/",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].name, "WEBCODECS");
        assert_eq!(entries[0].base_url, "https://w3c.github.io/webcodecs");
        assert_eq!(entries[0].github_repo, "w3c/webcodecs");
    }

    #[test]
    fn test_collect_standalone_tr_url_becomes_github_io() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "permissions",
            "https://www.w3.org/TR/permissions/",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].base_url, "https://w3c.github.io/permissions");
    }

    #[test]
    fn test_collect_standalone_bare_hostname_gets_https() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "rdf-tests",
            "w3c.github.io/rdf-tests",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].base_url, "https://w3c.github.io/rdf-tests");
    }

    #[test]
    fn test_collect_standalone_skips_archived() {
        let mut r = make_repo(
            "w3c",
            "old-spec",
            "https://w3c.github.io/old-spec/",
            &["rec-track"],
        );
        r["isArchived"] = serde_json::json!(true);
        let entries = collect_from(serde_json::json!([r]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_standalone_skips_non_spec_types() {
        let entries = collect_from(serde_json::json!([
            make_repo("w3c", "tests", "https://w3c.github.io/tests/", &["tests"]),
            make_repo("w3c", "tool", "https://w3c.github.io/tool/", &["tool"]),
        ]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_standalone_includes_cg_report() {
        let entries = collect_from(serde_json::json!([make_repo(
            "WICG",
            "keyboard-lock",
            "https://wicg.github.io/keyboard-lock/",
            &["cg-report"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].name, "KEYBOARD-LOCK");
    }

    #[test]
    fn test_collect_standalone_skips_csswg_monorepo() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "csswg-drafts",
            "https://drafts.csswg.org/index.html",
            &["rec-track"]
        )]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_standalone_skips_webassembly_org() {
        let entries = collect_from(serde_json::json!([make_repo(
            "WebAssembly",
            "threads",
            "https://webassembly.github.io/threads/",
            &["rec-track"]
        )]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_resolve_collisions_disambiguates_by_org() {
        let mut entries = vec![
            SpecEntry {
                name: "SPEC".into(),
                base_url: "https://foo.github.io/spec".into(),
                provider: "w3c".into(),
                github_repo: "foo/spec".into(),
            },
            SpecEntry {
                name: "SPEC".into(),
                base_url: "https://bar.github.io/spec".into(),
                provider: "w3c".into(),
                github_repo: "bar/spec".into(),
            },
        ];
        resolve_collisions(&mut entries);
        let names: Vec<&str> = entries.iter().map(|e| e.name.as_str()).collect();
        assert!(names.contains(&"SPEC-FOO"));
        assert!(names.contains(&"SPEC-BAR"));
    }

    #[test]
    fn test_no_duplicate_names_or_urls_in_generated_list() {
        let specs = bundled_specs();

        let mut names: Vec<&str> = specs.iter().map(|s| s.name.as_str()).collect();
        names.sort();
        let before = names.len();
        names.dedup();
        assert_eq!(
            names.len(),
            before,
            "Duplicate names in data/w3c_specs.json"
        );

        let mut urls: Vec<&str> = specs.iter().map(|s| s.base_url.as_str()).collect();
        urls.sort();
        let before = urls.len();
        urls.dedup();
        assert_eq!(
            urls.len(),
            before,
            "Duplicate base_urls in data/w3c_specs.json"
        );
    }

    #[test]
    fn test_generated_list_all_https() {
        for s in &bundled_specs() {
            assert!(
                s.base_url.starts_with("https://"),
                "Non-https URL in data/w3c_specs.json: {} -> {}",
                s.name,
                s.base_url
            );
        }
    }
}