1use anyhow::{Context, Result};
2use rusqlite::Connection;
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::path::Path;
6use std::process::Command;
7
/// Git URL of the CSS Working Group drafts repository that `update` clones or pulls.
const CSSWG_URL: &str = "https://github.com/w3c/csswg-drafts";
/// Git URL of the W3C groups repository that `update` clones or pulls; its
/// `repositories.json` is the input of `collect_standalone`.
const GROUPS_URL: &str = "https://github.com/w3c/groups";
/// Contents of `../data/w3c_specs.json`, embedded at compile time and parsed by `fetch_and_seed`.
const BUNDLED_SPEC_LIST: &str = include_str!("../data/w3c_specs.json");
11
/// One specification record, as (de)serialized to and from the JSON spec list.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpecEntry {
    /// Spec identifier. The collectors in this file derive it by upper-casing a
    /// directory or repository name; `resolve_collisions` may rewrite it.
    pub name: String,
    /// Base URL of the spec.
    pub base_url: String,
    /// Provider label; the collectors in this file always set it to `"w3c"`.
    pub provider: String,
    /// GitHub repository in `owner/name` form.
    pub github_repo: String,
}
19
20pub fn fetch_and_seed(conn: &Connection) -> Result<usize> {
22 let entries: Vec<SpecEntry> = serde_json::from_str(BUNDLED_SPEC_LIST)
23 .context("Failed to parse bundled w3c_specs.json")?;
24 let mut count = entries.len();
25 for e in &entries {
26 crate::db::write::seed_spec(conn, &e.name, &e.base_url, &e.provider)?;
27 }
28
29 let known = crate::spec_registry::known_specs();
30 count += known.len();
31 for (name, base_url, provider) in &known {
32 crate::db::write::seed_spec(conn, name, base_url, provider)?;
33 }
34
35 Ok(count)
36}
37
38pub fn update(
44 csswg_dir: &Path,
45 groups_dir: &Path,
46 output: &Path,
47) -> Result<(usize, usize, Vec<SpecEntry>)> {
48 clone_or_update(CSSWG_URL, csswg_dir)?;
49 clone_or_update(GROUPS_URL, groups_dir)?;
50
51 let csswg = collect_csswg(csswg_dir);
52 let standalone = collect_standalone(groups_dir)?;
53 let csswg_count = csswg.len();
54 let standalone_count = standalone.len();
55
56 let mut all = csswg;
57 all.extend(standalone);
58 resolve_collisions(&mut all);
59
60 let mut seen_names = std::collections::HashSet::new();
61 let mut seen_urls = std::collections::HashSet::new();
62 all.retain(|e| seen_names.insert(e.name.clone()) && seen_urls.insert(e.base_url.clone()));
63 all.sort_by(|a, b| a.name.cmp(&b.name));
64
65 if let Some(parent) = output.parent() {
66 std::fs::create_dir_all(parent)?;
67 }
68 let json = serde_json::to_string_pretty(&all)?;
69 std::fs::write(output, format!("{}\n", json))
70 .with_context(|| format!("Failed to write {}", output.display()))?;
71
72 Ok((csswg_count, standalone_count, all))
73}
74
75fn clone_or_update(url: &str, local_path: &Path) -> Result<()> {
76 if local_path.join(".git").is_dir() {
77 eprintln!("Updating {} ...", local_path.display());
78 let status = Command::new("git")
79 .args(["-C", local_path.to_str().unwrap(), "pull", "--depth=1"])
80 .status()
81 .with_context(|| format!("Failed to run git pull in {}", local_path.display()))?;
82 if !status.success() {
83 anyhow::bail!("git pull failed in {}", local_path.display());
84 }
85 } else {
86 eprintln!("Cloning {} into {} ...", url, local_path.display());
87 let status = Command::new("git")
88 .args(["clone", "--depth=1", url, local_path.to_str().unwrap()])
89 .status()
90 .with_context(|| format!("Failed to clone {}", url))?;
91 if !status.success() {
92 anyhow::bail!("git clone failed for {}", url);
93 }
94 }
95 Ok(())
96}
97
98fn collect_csswg(csswg_dir: &Path) -> Vec<SpecEntry> {
99 let mut entries = Vec::new();
100 let skip = ["bin", "css-module"];
101 let read_dir = match std::fs::read_dir(csswg_dir) {
102 Ok(d) => d,
103 Err(e) => {
104 eprintln!("warning: cannot read {}: {}", csswg_dir.display(), e);
105 return entries;
106 }
107 };
108 let mut dirs: Vec<_> = read_dir.flatten().filter(|e| e.path().is_dir()).collect();
109 dirs.sort_by_key(|e| e.file_name());
110
111 for entry in dirs {
112 let dir_name = entry.file_name();
113 let dir_name = dir_name.to_string_lossy();
114 if dir_name.starts_with('.') || skip.contains(&dir_name.as_ref()) {
115 continue;
116 }
117 let has_bs = std::fs::read_dir(entry.path())
118 .ok()
119 .map(|rd| {
120 rd.flatten()
121 .any(|f| f.file_name().to_string_lossy().ends_with(".bs"))
122 })
123 .unwrap_or(false);
124 if !has_bs {
125 continue;
126 }
127 entries.push(SpecEntry {
128 name: dir_name.to_uppercase(),
129 base_url: format!("https://drafts.csswg.org/{}", dir_name),
130 provider: "w3c".to_string(),
131 github_repo: "w3c/csswg-drafts".to_string(),
132 });
133 }
134 entries
135}
136
137fn collect_standalone(groups_dir: &Path) -> Result<Vec<SpecEntry>> {
138 let repos_path = groups_dir.join("repositories.json");
139 let data = std::fs::read_to_string(&repos_path)
140 .with_context(|| format!("Failed to read {}", repos_path.display()))?;
141 let repos: Vec<serde_json::Value> =
142 serde_json::from_str(&data).context("Failed to parse repositories.json")?;
143
144 let mut entries = Vec::new();
145 for r in &repos {
146 if r.get("isArchived")
147 .and_then(|v| v.as_bool())
148 .unwrap_or(false)
149 {
150 continue;
151 }
152 if r.get("isPrivate")
153 .and_then(|v| v.as_bool())
154 .unwrap_or(false)
155 {
156 continue;
157 }
158 let types: Vec<&str> = r
159 .get("w3cjson")
160 .and_then(|v| v.get("repo-type"))
161 .and_then(|v| v.as_array())
162 .map(|a| a.iter().filter_map(|v| v.as_str()).collect())
163 .unwrap_or_default();
164 if !types.contains(&"rec-track") && !types.contains(&"cg-report") {
165 continue;
166 }
167 let owner = r
168 .get("owner")
169 .and_then(|v| v.get("login"))
170 .and_then(|v| v.as_str())
171 .unwrap_or("");
172 let repo_name = r.get("name").and_then(|v| v.as_str()).unwrap_or("");
173 if owner.is_empty() || repo_name.is_empty() {
174 continue;
175 }
176 if owner == "w3c" && repo_name == "csswg-drafts" {
177 continue;
178 }
179 if owner == "WebAssembly" {
180 continue;
181 }
182
183 let hp_raw = r
184 .get("homepageUrl")
185 .and_then(|v| v.as_str())
186 .unwrap_or("")
187 .trim_end_matches('/')
188 .replace("http://", "https://");
189 let hp = if !hp_raw.is_empty() && !hp_raw.starts_with("https://") {
190 format!("https://{}", hp_raw)
191 } else {
192 hp_raw
193 };
194
195 let base_url = if hp.contains(".github.io") && !hp.ends_with(".github.io") {
196 hp
197 } else if owner == "w3c" && (hp.starts_with("https://www.w3.org/TR/") || hp.is_empty()) {
198 format!("https://w3c.github.io/{}", repo_name)
199 } else {
200 continue;
201 };
202
203 entries.push(SpecEntry {
204 name: repo_name.to_uppercase(),
205 base_url,
206 provider: "w3c".to_string(),
207 github_repo: format!("{}/{}", owner, repo_name),
208 });
209 }
210 Ok(entries)
211}
212
213fn resolve_collisions(entries: &mut [SpecEntry]) {
214 let mut counts: HashMap<String, usize> = HashMap::new();
215 for e in entries.iter() {
216 *counts.entry(e.name.clone()).or_insert(0) += 1;
217 }
218 for e in entries.iter_mut() {
219 if counts[&e.name] > 1 {
220 let org = e.github_repo.split('/').next().unwrap_or("").to_uppercase();
221 e.name = format!("{}-{}", e.name, org);
222 }
223 }
224 let mut counts: HashMap<String, usize> = HashMap::new();
225 for e in entries.iter() {
226 *counts.entry(e.name.clone()).or_insert(0) += 1;
227 }
228 for e in entries.iter_mut() {
229 if counts[&e.name] > 1 {
230 e.name = e.github_repo.replace('/', "-").to_uppercase();
231 }
232 }
233}
234
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `repositories.json`-style record for a public, non-archived repository.
    fn make_repo(
        owner: &str,
        name: &str,
        homepage: &str,
        repo_types: &[&str],
    ) -> serde_json::Value {
        serde_json::json!({
            "name": name,
            "owner": {"login": owner},
            "homepageUrl": homepage,
            "isArchived": false,
            "isPrivate": false,
            "w3cjson": {
                "repo-type": repo_types
            }
        })
    }

    /// Writes `repos` as `repositories.json` into a fresh temp dir and runs
    /// `collect_standalone` on it.
    fn collect_from(repos: serde_json::Value) -> Vec<SpecEntry> {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("repositories.json"), repos.to_string()).unwrap();
        collect_standalone(dir.path()).unwrap()
    }

    /// Parses the spec list that is compiled into the binary.
    ///
    /// Using the embedded copy keeps these checks from being skipped when the
    /// tests run from a working directory where `data/w3c_specs.json` is not
    /// reachable.
    fn bundled_specs() -> Vec<SpecEntry> {
        serde_json::from_str(BUNDLED_SPEC_LIST).expect("bundled w3c_specs.json should parse")
    }

    #[test]
    fn test_collect_standalone_github_io_url() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "webcodecs",
            "https://w3c.github.io/webcodecs/",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].name, "WEBCODECS");
        assert_eq!(entries[0].base_url, "https://w3c.github.io/webcodecs");
        assert_eq!(entries[0].github_repo, "w3c/webcodecs");
    }

    #[test]
    fn test_collect_standalone_tr_url_becomes_github_io() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "permissions",
            "https://www.w3.org/TR/permissions/",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].base_url, "https://w3c.github.io/permissions");
    }

    #[test]
    fn test_collect_standalone_bare_hostname_gets_https() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "rdf-tests",
            "w3c.github.io/rdf-tests",
            &["rec-track"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].base_url, "https://w3c.github.io/rdf-tests");
    }

    #[test]
    fn test_collect_standalone_skips_archived() {
        let mut repo = make_repo(
            "w3c",
            "old-spec",
            "https://w3c.github.io/old-spec/",
            &["rec-track"],
        );
        repo["isArchived"] = serde_json::json!(true);
        assert!(collect_from(serde_json::json!([repo])).is_empty());
    }

    #[test]
    fn test_collect_standalone_skips_non_spec_types() {
        let entries = collect_from(serde_json::json!([
            make_repo("w3c", "tests", "https://w3c.github.io/tests/", &["tests"]),
            make_repo("w3c", "tool", "https://w3c.github.io/tool/", &["tool"]),
        ]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_standalone_includes_cg_report() {
        let entries = collect_from(serde_json::json!([make_repo(
            "WICG",
            "keyboard-lock",
            "https://wicg.github.io/keyboard-lock/",
            &["cg-report"]
        )]));
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].name, "KEYBOARD-LOCK");
    }

    #[test]
    fn test_collect_standalone_skips_csswg_monorepo() {
        let entries = collect_from(serde_json::json!([make_repo(
            "w3c",
            "csswg-drafts",
            "https://drafts.csswg.org/index.html",
            &["rec-track"]
        )]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_standalone_skips_webassembly_org() {
        let entries = collect_from(serde_json::json!([make_repo(
            "WebAssembly",
            "threads",
            "https://webassembly.github.io/threads/",
            &["rec-track"]
        )]));
        assert!(entries.is_empty());
    }

    #[test]
    fn test_resolve_collisions_disambiguates_by_org() {
        let mut entries = vec![
            SpecEntry {
                name: "SPEC".into(),
                base_url: "https://foo.github.io/spec".into(),
                provider: "w3c".into(),
                github_repo: "foo/spec".into(),
            },
            SpecEntry {
                name: "SPEC".into(),
                base_url: "https://bar.github.io/spec".into(),
                provider: "w3c".into(),
                github_repo: "bar/spec".into(),
            },
        ];
        resolve_collisions(&mut entries);
        let names: Vec<&str> = entries.iter().map(|e| e.name.as_str()).collect();
        assert!(names.contains(&"SPEC-FOO"));
        assert!(names.contains(&"SPEC-BAR"));
    }

    #[test]
    fn test_no_duplicate_names_or_urls_in_generated_list() {
        let specs = bundled_specs();

        let mut names = std::collections::HashSet::new();
        let mut urls = std::collections::HashSet::new();
        for spec in &specs {
            assert!(
                names.insert(spec.name.as_str()),
                "Duplicate names in data/w3c_specs.json: {}",
                spec.name
            );
            assert!(
                urls.insert(spec.base_url.as_str()),
                "Duplicate base_urls in data/w3c_specs.json: {}",
                spec.base_url
            );
        }
    }

    #[test]
    fn test_generated_list_all_https() {
        for s in &bundled_specs() {
            assert!(
                s.base_url.starts_with("https://"),
                "Non-https URL in data/w3c_specs.json: {} -> {}",
                s.name,
                s.base_url
            );
        }
    }
}