use std::collections::{HashMap, HashSet};
use std::path::Path;
use anyhow::{Context, Result};
use futures_util::stream::StreamExt;
use lintel_schema_cache::SchemaCache;
use tracing::{debug, error, info, warn};
use crate::download::ProcessedSchemas;
/// Shared state threaded through `$ref` resolution and rewriting for one schema.
pub struct RefRewriteContext<'a> {
/// Cache handle passed to the downloader and to post-processing.
pub cache: &'a SchemaCache,
/// Directory into which downloaded dependency schemas are written.
pub shared_dir: &'a Path,
/// URL prefix joined with a dependency's filename (after trimming trailing
/// `/`) to form the rewritten `$ref` target and the dependency's `$id`.
pub base_url_for_shared: &'a str,
/// Maps a dependency's download URL to the filename chosen for it in
/// `shared_dir`; consulted so a URL is not fetched twice.
pub already_downloaded: &'a mut HashMap<String, String>,
/// URL the schema was fetched from; used as the base when resolving
/// relative `$ref`s. When `None`, relative refs are left unresolved.
pub source_url: Option<String>,
/// Handed to `write_schema_json` for every schema written.
pub processed: &'a ProcessedSchemas,
/// Forwarded unchanged to the post-processing context.
pub lintel_source: Option<(String, String)>,
/// When set, relative `$ref`s are first looked up as files under this
/// directory before falling back to an HTTP fetch.
pub local_source_dir: Option<&'a Path>,
/// Maps a relative `$ref` (without fragment) to the URL it is rewritten to
/// before any other resolution takes place.
pub sibling_urls: HashMap<String, String>,
/// Forwarded unchanged to the post-processing context.
pub file_match: Vec<String>,
/// Forwarded unchanged to the post-processing context.
pub parsers: Vec<schema_catalog::FileFormat>,
}
/// Returns every distinct `http://` / `https://` document referenced by a
/// `$ref` anywhere inside `value`, with the `#fragment` part removed.
pub fn find_external_refs(value: &serde_json::Value) -> HashSet<String> {
    let mut found: HashSet<String> = HashSet::new();
    collect_refs(value, &mut found);
    found
}
/// Returns every distinct non-HTTP `$ref` document path found anywhere inside
/// `value`, with the `#fragment` part removed. Fragment-only refs are skipped.
pub fn find_relative_refs(value: &serde_json::Value) -> HashSet<String> {
    let mut found: HashSet<String> = HashSet::new();
    collect_relative_refs(value, &mut found);
    found
}
fn collect_refs(value: &serde_json::Value, refs: &mut HashSet<String>) {
match value {
serde_json::Value::Object(map) => {
if let Some(serde_json::Value::String(ref_str)) = map.get("$ref")
&& (ref_str.starts_with("http://") || ref_str.starts_with("https://"))
{
let base = ref_str.split('#').next().unwrap_or(ref_str);
if !base.is_empty() {
refs.insert(base.to_string());
}
}
for v in map.values() {
collect_refs(v, refs);
}
}
serde_json::Value::Array(arr) => {
for v in arr {
collect_refs(v, refs);
}
}
_ => {}
}
}
fn collect_relative_refs(value: &serde_json::Value, refs: &mut HashSet<String>) {
match value {
serde_json::Value::Object(map) => {
if let Some(serde_json::Value::String(ref_str)) = map.get("$ref") {
let base = ref_str.split('#').next().unwrap_or(ref_str);
if !base.is_empty() && !base.starts_with("http://") && !base.starts_with("https://")
{
refs.insert(base.to_string());
}
}
for v in map.values() {
collect_relative_refs(v, refs);
}
}
serde_json::Value::Array(arr) => {
for v in arr {
collect_relative_refs(v, refs);
}
}
_ => {}
}
}
/// Resolves `relative` against `base_url` and returns the resulting URL as a
/// string.
///
/// # Errors
/// Fails when `base_url` does not parse as a URL or when the join is rejected.
fn resolve_relative_url(relative: &str, base_url: &str) -> Result<String> {
    let parsed_base =
        url::Url::parse(base_url).with_context(|| format!("invalid base URL: {base_url}"))?;
    parsed_base
        .join(relative)
        .map(|joined| joined.to_string())
        .with_context(|| format!("failed to resolve '{relative}' against '{base_url}'"))
}
pub fn filename_from_url(url: &str) -> Result<String> {
let parsed = url::Url::parse(url).with_context(|| format!("invalid URL: {url}"))?;
let segments: Vec<&str> = parsed
.path_segments()
.map(Iterator::collect)
.unwrap_or_default();
if let Some(last) = segments.last().filter(|s| !s.is_empty()) {
let name = (*last).to_string();
if std::path::Path::new(&name)
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
{
return Ok(name);
}
return Ok(format!("{name}.json"));
}
let host = parsed
.host_str()
.with_context(|| format!("URL has no host: {url}"))?;
Ok(format!("{host}.json"))
}
/// Picks a filename derived from `base` that does not currently exist in `dir`.
///
/// Returns `base` itself when it is free; otherwise tries `stem-2.ext`,
/// `stem-3.ext`, ... (splitting `base` at its last `.`) and returns the first
/// name that is not present on disk.
fn unique_filename_in(dir: &Path, base: &str) -> String {
    let is_taken = |name: &str| dir.join(name).exists();
    if !is_taken(base) {
        return base.to_string();
    }

    // Everything from the last '.' onwards is treated as the extension.
    let split_at = base.rfind('.').unwrap_or(base.len());
    let (stem, ext) = base.split_at(split_at);

    let mut counter = 2u32;
    let mut candidate = format!("{stem}-{counter}{ext}");
    while is_taken(&candidate) {
        counter += 1;
        candidate = format!("{stem}-{counter}{ext}");
    }
    candidate
}
pub fn rewrite_refs(value: &mut serde_json::Value, url_map: &HashMap<String, String>) {
match value {
serde_json::Value::Object(map) => {
if let Some(serde_json::Value::String(ref_str)) = map.get("$ref") {
let (base, fragment) = match ref_str.split_once('#') {
Some((b, f)) => (b, Some(f)),
None => (ref_str.as_str(), None),
};
if let Some(new_base) = url_map.get(base) {
let new_ref = match fragment {
Some(f) => format!("{new_base}#{f}"),
None => new_base.clone(),
};
map.insert("$ref".to_string(), serde_json::Value::String(new_ref));
}
}
for v in map.values_mut() {
rewrite_refs(v, url_map);
}
}
serde_json::Value::Array(arr) => {
for v in arr {
rewrite_refs(v, url_map);
}
}
_ => {}
}
}
/// Resolves every relative `$ref` in `value` against `source_url`.
///
/// Returns a map from the relative ref (as written, without fragment) to its
/// absolute URL. Refs that fail to resolve are logged and omitted. Without a
/// `source_url` nothing can be resolved and the map is empty.
fn resolve_all_relative_refs(
    value: &serde_json::Value,
    source_url: Option<&str>,
) -> HashMap<String, String> {
    let candidates = find_relative_refs(value);

    let Some(base) = source_url else {
        if !candidates.is_empty() {
            debug!(
                count = candidates.len(),
                "skipping relative $ref resolution (no source URL)"
            );
        }
        return HashMap::new();
    };

    let mut absolute_by_relative: HashMap<String, String> = HashMap::new();
    for rel_ref in candidates {
        match resolve_relative_url(&rel_ref, base) {
            Ok(abs_url) => {
                debug!(relative = %rel_ref, resolved = %abs_url, "resolved relative $ref");
                absolute_by_relative.insert(rel_ref, abs_url);
            }
            Err(e) => {
                warn!(relative = %rel_ref, error = %e, "failed to resolve relative $ref");
            }
        }
    }
    absolute_by_relative
}
/// Builds the post-processing context for the root schema from `ctx`: the
/// cache reference is shared, the remaining fields are cloned.
fn postprocess_ctx<'a>(ctx: &RefRewriteContext<'a>) -> crate::postprocess::PostprocessContext<'a> {
    let source_url = ctx.source_url.clone();
    let lintel_source = ctx.lintel_source.clone();
    let file_match = ctx.file_match.clone();
    let parsers = ctx.parsers.clone();
    crate::postprocess::PostprocessContext {
        cache: ctx.cache,
        source_url,
        lintel_source,
        file_match,
        parsers,
    }
}
/// Parses `schema_text` as JSON and runs [`resolve_and_rewrite_value`] on it.
///
/// # Errors
/// Fails when the text is not valid JSON, or when resolving/writing fails.
pub async fn resolve_and_rewrite(
    ctx: &mut RefRewriteContext<'_>,
    schema_text: &str,
    schema_dest: &Path,
    schema_url: &str,
) -> Result<()> {
    let mut root = serde_json::from_str::<serde_json::Value>(schema_text)
        .context("failed to parse schema JSON")?;
    resolve_and_rewrite_value(ctx, &mut root, schema_dest, schema_url).await
}
/// Resolves the `$ref` dependencies of `value`, rewrites them to local URLs
/// and writes the schema (and its dependencies) to disk.
///
/// Steps, in order:
/// 1. set the root `$id` to `schema_url` and migrate the schema to 2020-12
///    (a failed post-migration deserialization is only logged);
/// 2. rewrite relative refs that have an entry in `ctx.sibling_urls`;
/// 3. resolve the remaining relative refs against `ctx.source_url`;
/// 4. fetch external and resolved relative refs, then rewrite them;
/// 5. post-process and write the schema to `schema_dest`, followed by the
///    fetched dependencies.
///
/// # Errors
/// Fails when the root is not a JSON object, or when fetching or writing
/// returns an error.
pub async fn resolve_and_rewrite_value(
    ctx: &mut RefRewriteContext<'_>,
    value: &mut serde_json::Value,
    schema_dest: &Path,
    schema_url: &str,
) -> Result<()> {
    let root = value
        .as_object_mut()
        .context("schema root must be an object")?;
    root.insert(
        "$id".to_string(),
        serde_json::Value::String(schema_url.to_string()),
    );

    jsonschema_migrate::migrate_to_2020_12(value);
    if let Some(e) = serde_json::from_value::<jsonschema_migrate::Schema>(value.clone()).err() {
        error!(url = %schema_url, error = %e, "schema failed to deserialize after migration");
    }

    let external_refs = find_external_refs(value);

    // Split relative refs into those with a known sibling URL and the rest.
    let mut sibling_map: HashMap<String, String> = HashMap::new();
    let mut unresolved_relative: HashSet<String> = HashSet::new();
    for rel in find_relative_refs(value) {
        match ctx.sibling_urls.get(rel.as_str()) {
            Some(canonical) => {
                debug!(relative = %rel, resolved = %canonical, "resolved sibling $ref");
                sibling_map.insert(rel, canonical.clone());
            }
            None => {
                unresolved_relative.insert(rel);
            }
        }
    }
    if !sibling_map.is_empty() {
        rewrite_refs(value, &sibling_map);
    }

    let resolved_relative: HashMap<String, String> = if unresolved_relative.is_empty() {
        HashMap::new()
    } else {
        let mut resolved = resolve_all_relative_refs(value, ctx.source_url.as_deref());
        resolved.retain(|rel, _| unresolved_relative.contains(rel));
        resolved
    };

    // Fetch dependencies only when there is something to fetch.
    let fetched = if external_refs.is_empty() && resolved_relative.is_empty() {
        None
    } else {
        debug!(
            external = external_refs.len(),
            relative = resolved_relative.len(),
            "found $ref dependencies"
        );

        // Queue entries are (ref as written, URL to download, dependency's own URL).
        let mut pending: Vec<(String, String, Option<String>)> =
            Vec::with_capacity(external_refs.len() + resolved_relative.len());
        for url in &external_refs {
            pending.push((url.clone(), url.clone(), Some(url.clone())));
        }
        for (rel, abs) in &resolved_relative {
            pending.push((rel.clone(), abs.clone(), Some(abs.clone())));
        }

        let parent_stem = schema_dest
            .file_stem()
            .map(|stem| stem.to_string_lossy().into_owned())
            .unwrap_or_default();
        Some(fetch_refs_queued(ctx, pending, &parent_stem).await?)
    };

    if let Some((url_map, _)) = &fetched {
        rewrite_refs(value, url_map);
    }
    crate::postprocess::postprocess_schema(&postprocess_ctx(ctx), value);
    crate::download::write_schema_json(value, schema_dest, ctx.processed).await?;

    if let Some((url_map, dep_values)) = fetched {
        write_dep_schemas(ctx, dep_values, &url_map).await?;
    }
    Ok(())
}
/// Returns `true` when `ref_key` looks like a relative file path that may be
/// joined onto a local source directory.
///
/// Rejected: the empty string, `http://` / `https://` URLs, fragment-only refs
/// (`#...`), and rooted paths such as `/schemas/x.json` or `//host/x.json`.
/// Rooted paths must be excluded because `Path::join` discards the directory
/// it is joined onto when the argument has a root, which would make the
/// caller read from the filesystem root instead of the source directory.
fn is_relative_path(ref_key: &str) -> bool {
    !ref_key.is_empty()
        && !ref_key.starts_with("http://")
        && !ref_key.starts_with("https://")
        && !ref_key.starts_with('#')
        && !Path::new(ref_key).has_root()
}
/// Queues the `$ref` dependencies of an already-fetched dependency schema.
///
/// Each queue entry is `(ref key used in the URL map, URL to download, the
/// dependency's own URL)`.
///
/// * External (`http`/`https`) refs are always queued, even when the URL is
///   already in `already_downloaded`. The queue consumer short-circuits known
///   URLs without re-downloading, and that step is what records the
///   URL -> local file mapping. Skipping them here would leave a URL that was
///   downloaded while processing a different schema without a mapping, so the
///   dependency would keep pointing at the remote URL.
/// * Relative refs are resolved against `source_url` and queued only when
///   their absolute URL has not been downloaded yet. Already-downloaded ones
///   are mapped later, per dependency, when the dependency is written.
fn enqueue_transitive_refs(
    dep_value: &serde_json::Value,
    source_url: Option<&str>,
    already_downloaded: &HashMap<String, String>,
    pending: &mut Vec<(String, String, Option<String>)>,
) {
    for url in find_external_refs(dep_value) {
        pending.push((url.clone(), url.clone(), Some(url)));
    }
    for (rel, abs) in resolve_all_relative_refs(dep_value, source_url) {
        if !already_downloaded.contains_key(&abs) {
            pending.push((rel, abs.clone(), Some(abs)));
        }
    }
}
/// Fetches the queued `$ref` dependencies and, transitively, their own
/// dependencies.
///
/// `initial` holds `(ref_key, download_url, source_url)` entries: the ref as
/// it should be keyed in the returned map, the URL to fetch, and the URL later
/// used as the base for the dependency's own relative refs.
///
/// Returns:
/// * a map from `ref_key` to the local URL (`base_url_for_shared/filename`)
///   the ref should be rewritten to;
/// * the fetched dependencies as `(filename, parsed JSON, source_url)`, not
///   yet written to disk.
///
/// Relative refs are first looked up under `ctx.local_source_dir` (when set);
/// everything else is downloaded via `crate::download::fetch_one`. Downloads
/// run concurrently. A failed download is logged and its mapping removed so
/// the original ref is kept.
///
/// # Errors
/// Fails when the shared directory cannot be created or a download URL cannot
/// be turned into a filename.
async fn fetch_refs_queued(
    ctx: &mut RefRewriteContext<'_>,
    initial: Vec<(String, String, Option<String>)>,
    parent_stem: &str,
) -> Result<(
    HashMap<String, String>,
    Vec<(String, serde_json::Value, Option<String>)>,
)> {
    let mut url_map: HashMap<String, String> = HashMap::new();
    let mut dep_values: Vec<(String, serde_json::Value, Option<String>)> = Vec::new();
    let mut pending: Vec<(String, String, Option<String>)> = initial;
    let mut in_flight = futures_util::stream::FuturesUnordered::new();
    let mut shared_dir_created = false;
    loop {
        // Drain the queue: start a fetch (or local read) for every new URL.
        while let Some((ref_key, download_url, source_url)) = pending.pop() {
            // Known URL: reuse the file chosen earlier, no second download.
            if let Some(existing_filename) = ctx.already_downloaded.get(&download_url) {
                let local_url = format!(
                    "{}/{}",
                    ctx.base_url_for_shared.trim_end_matches('/'),
                    existing_filename,
                );
                url_map.insert(ref_key, local_url);
                continue;
            }
            if !shared_dir_created {
                tokio::fs::create_dir_all(ctx.shared_dir).await?;
                shared_dir_created = true;
            }
            let dep_basename = filename_from_url(&download_url)?;
            let base_filename = format!("{parent_stem}--{dep_basename}");
            let mut filename = unique_filename_in(ctx.shared_dir, &base_filename);
            // Dependency files are only written after the whole queue has been
            // processed, so a name handed out earlier may not exist on disk
            // yet. Also skip names already reserved in `already_downloaded`;
            // otherwise two URLs with the same basename would share one file
            // and the later write would overwrite the earlier one.
            let (stem, ext) = match base_filename.rfind('.') {
                Some(pos) => base_filename.split_at(pos),
                None => (base_filename.as_str(), ""),
            };
            let mut suffix = 2u32;
            while ctx
                .already_downloaded
                .values()
                .any(|reserved| *reserved == filename)
            {
                filename = unique_filename_in(ctx.shared_dir, &format!("{stem}-{suffix}{ext}"));
                suffix += 1;
            }
            // Reserve the name immediately so concurrent references to the
            // same URL resolve to this file.
            ctx.already_downloaded
                .insert(download_url.clone(), filename.clone());
            let local_url = format!(
                "{}/{}",
                ctx.base_url_for_shared.trim_end_matches('/'),
                filename,
            );
            url_map.insert(ref_key.clone(), local_url.clone());
            // Prefer a local copy for relative refs when a source dir is known.
            if let Some(local_dir) = ctx.local_source_dir
                && is_relative_path(&ref_key)
            {
                let local_path = local_dir.join(&ref_key);
                if let Ok(text) = tokio::fs::read_to_string(&local_path).await {
                    match serde_json::from_str::<serde_json::Value>(&text) {
                        Ok(dep_value) => {
                            info!(path = %local_path.display(), "read local $ref dependency");
                            enqueue_transitive_refs(
                                &dep_value,
                                source_url.as_deref(),
                                ctx.already_downloaded,
                                &mut pending,
                            );
                            dep_values.push((filename, dep_value, source_url));
                            continue;
                        }
                        Err(e) => {
                            debug!(
                                path = %local_path.display(),
                                error = %e,
                                "local file not valid JSON, falling back to HTTP"
                            );
                        }
                    }
                }
            }
            let cache = ctx.cache.clone();
            in_flight.push(async move {
                let result = crate::download::fetch_one(&cache, &download_url).await;
                (download_url, filename, local_url, source_url, result)
            });
        }
        if in_flight.is_empty() {
            break;
        }
        // Wait for one download; its refs may refill the queue.
        let Some((download_url, filename, local_url, source_url, result)) = in_flight.next().await
        else {
            break;
        };
        match result {
            Ok((dep_value, status)) => {
                info!(url = %download_url, status = %status, "downloaded $ref dependency");
                enqueue_transitive_refs(
                    &dep_value,
                    source_url.as_deref(),
                    ctx.already_downloaded,
                    &mut pending,
                );
                dep_values.push((filename, dep_value, source_url));
            }
            Err(e) => {
                warn!(url = %download_url, error = %e, "failed to download $ref dependency, keeping original URL");
                // Release the reservation and drop every mapping to the file
                // that will never be written.
                ctx.already_downloaded.remove(&download_url);
                url_map.retain(|_, v| v != &local_url);
            }
        }
    }
    Ok((url_map, dep_values))
}
/// Rewrites, post-processes and writes every fetched dependency schema into
/// `ctx.shared_dir`.
///
/// For each `(filename, schema, source_url)`:
/// * the ref map starts as a copy of `url_map` and is extended with the
///   dependency's own relative refs (resolved against its `source_url`) whose
///   absolute URL is present in `ctx.already_downloaded`;
/// * `$id` is set to the dependency's local URL, the schema is migrated to
///   2020-12 (a failed post-migration deserialization is only logged), refs
///   are rewritten, and the result is post-processed and written.
///
/// # Errors
/// Propagates the first write failure.
async fn write_dep_schemas(
    ctx: &RefRewriteContext<'_>,
    dep_values: Vec<(String, serde_json::Value, Option<String>)>,
    url_map: &HashMap<String, String>,
) -> Result<()> {
    let shared_base = ctx.base_url_for_shared.trim_end_matches('/');

    for (filename, mut dep_value, source_url) in dep_values {
        let dep_dest = ctx.shared_dir.join(&filename);

        // Relative refs are resolved before `$id` is replaced and the schema
        // is migrated, i.e. against the document as it was fetched.
        let own_relative = resolve_all_relative_refs(&dep_value, source_url.as_deref());
        let mut dep_url_map = url_map.clone();
        dep_url_map.extend(own_relative.into_iter().filter_map(|(rel, abs)| {
            ctx.already_downloaded
                .get(&abs)
                .map(|existing_filename| (rel, format!("{shared_base}/{existing_filename}")))
        }));

        if let Some(obj) = dep_value.as_object_mut() {
            obj.insert(
                "$id".to_string(),
                serde_json::Value::String(format!("{shared_base}/{filename}")),
            );
        }

        jsonschema_migrate::migrate_to_2020_12(&mut dep_value);
        if let Some(e) =
            serde_json::from_value::<jsonschema_migrate::Schema>(dep_value.clone()).err()
        {
            let dep_url = source_url.as_deref().unwrap_or(&filename);
            error!(url = %dep_url, error = %e, "dependency schema failed to deserialize after migration");
        }

        rewrite_refs(&mut dep_value, &dep_url_map);
        crate::postprocess::postprocess_schema(
            &crate::postprocess::PostprocessContext {
                cache: ctx.cache,
                source_url,
                lintel_source: None,
                file_match: Vec::new(),
                parsers: Vec::new(),
            },
            &mut dep_value,
        );
        crate::download::write_schema_json(&dep_value, &dep_dest, ctx.processed).await?;
    }
    Ok(())
}
// Unit tests for the pure helpers in this file: ref discovery, filename
// derivation, relative URL resolution and ref rewriting.
#[cfg(test)]
mod tests {
use super::*;
// Absolute refs are collected without their fragment; fragment-only refs are ignored.
#[test]
fn find_refs_in_simple_schema() {
let schema = serde_json::json!({
"$ref": "https://example.com/base.json#/definitions/Foo",
"properties": {
"bar": { "$ref": "https://example.com/other.json" },
"local": { "$ref": "#/definitions/Local" }
}
});
let refs = find_external_refs(&schema);
assert_eq!(refs.len(), 2);
assert!(refs.contains("https://example.com/base.json"));
assert!(refs.contains("https://example.com/other.json"));
}
// Neither fragment-only nor relative-path refs count as external.
#[test]
fn find_refs_ignores_relative() {
let schema = serde_json::json!({
"$ref": "#/definitions/Local",
"items": { "$ref": "./local.json" }
});
let refs = find_external_refs(&schema);
assert!(refs.is_empty());
}
// Refs nested inside arrays are found as well.
#[test]
fn find_refs_in_arrays() {
let schema = serde_json::json!({
"oneOf": [
{ "$ref": "https://a.com/one.json" },
{ "$ref": "https://b.com/two.json#/defs/X" }
]
});
let refs = find_external_refs(&schema);
assert_eq!(refs.len(), 2);
assert!(refs.contains("https://a.com/one.json"));
assert!(refs.contains("https://b.com/two.json"));
}
// The last path segment is used as the filename.
#[test]
fn filename_from_url_extracts_last_segment() {
assert_eq!(
filename_from_url("https://example.com/schemas/foo.json").expect("ok"),
"foo.json"
);
}
// Leading directories in the path do not affect the filename.
#[test]
fn filename_from_url_with_path() {
assert_eq!(
filename_from_url("https://example.com/a/b/c/my-schema.json").expect("ok"),
"my-schema.json"
);
}
// A last segment without a `.json` extension gets one appended.
#[test]
fn filename_from_url_appends_json_extension() {
assert_eq!(
filename_from_url("https://example.com/version/1").expect("ok"),
"1.json"
);
assert_eq!(
filename_from_url("https://example.com/schemas/feed-1").expect("ok"),
"feed-1.json"
);
}
// Mapped URLs are replaced, fragments are preserved, unmapped refs stay as-is.
#[test]
fn rewrite_refs_replaces_mapped_urls() {
let mut schema = serde_json::json!({
"$ref": "https://example.com/base.json#/definitions/Foo",
"properties": {
"bar": { "$ref": "https://example.com/other.json" },
"local": { "$ref": "#/definitions/Local" }
}
});
let url_map: HashMap<String, String> = [
(
"https://example.com/base.json".to_string(),
"_shared/base.json".to_string(),
),
(
"https://example.com/other.json".to_string(),
"_shared/other.json".to_string(),
),
]
.into_iter()
.collect();
rewrite_refs(&mut schema, &url_map);
assert_eq!(schema["$ref"], "_shared/base.json#/definitions/Foo");
assert_eq!(schema["properties"]["bar"]["$ref"], "_shared/other.json");
assert_eq!(schema["properties"]["local"]["$ref"], "#/definitions/Local");
}
// A `./file.json#fragment` ref is reported as `./file.json`.
#[test]
fn find_relative_refs_dot_slash() {
let schema = serde_json::json!({
"properties": {
"rule": { "$ref": "./rule.json#/$defs/SerializableRule" }
}
});
let refs = find_relative_refs(&schema);
assert_eq!(refs.len(), 1);
assert!(refs.contains("./rule.json"));
}
// Fragment-only refs have an empty document part and are skipped.
#[test]
fn find_relative_refs_ignores_fragment_only() {
let schema = serde_json::json!({
"$ref": "#/definitions/Foo",
"items": { "$ref": "#/$defs/Bar" }
});
let refs = find_relative_refs(&schema);
assert!(refs.is_empty());
}
// Absolute HTTP(S) refs are not relative.
#[test]
fn find_relative_refs_ignores_http() {
let schema = serde_json::json!({
"$ref": "https://example.com/schema.json"
});
let refs = find_relative_refs(&schema);
assert!(refs.is_empty());
}
// `./`, `../` and bare sub-directory forms are all recognised.
#[test]
fn find_relative_refs_various_patterns() {
let schema = serde_json::json!({
"oneOf": [
{ "$ref": "./a.json" },
{ "$ref": "../b.json#/defs/X" },
{ "$ref": "subdir/c.json" }
]
});
let refs = find_relative_refs(&schema);
assert_eq!(refs.len(), 3);
assert!(refs.contains("./a.json"));
assert!(refs.contains("../b.json"));
assert!(refs.contains("subdir/c.json"));
}
// `./x` resolves next to the base document.
#[test]
fn resolve_relative_dot_slash() {
let result = resolve_relative_url(
"./rule.json",
"https://raw.githubusercontent.com/ast-grep/ast-grep/main/schemas/project.json",
)
.expect("ok");
assert_eq!(
result,
"https://raw.githubusercontent.com/ast-grep/ast-grep/main/schemas/rule.json"
);
}
// `../` climbs one directory above the base document.
#[test]
fn resolve_relative_parent_dir() {
let result = resolve_relative_url(
"../other/schema.json",
"https://example.com/schemas/sub/main.json",
)
.expect("ok");
assert_eq!(result, "https://example.com/schemas/other/schema.json");
}
// A bare filename replaces the last segment of the base URL.
#[test]
fn resolve_relative_bare_filename() {
let result = resolve_relative_url("types.json", "https://example.com/schemas/main.json")
.expect("ok");
assert_eq!(result, "https://example.com/schemas/types.json");
}
// Relative keys in the map are rewritten too, keeping the fragment.
#[test]
fn rewrite_refs_replaces_relative_refs() {
let mut schema = serde_json::json!({
"properties": {
"rule": { "$ref": "./rule.json#/$defs/SerializableRule" },
"local": { "$ref": "#/definitions/Local" }
}
});
let url_map: HashMap<String, String> = [(
"./rule.json".to_string(),
"_shared/project--rule.json".to_string(),
)]
.into_iter()
.collect();
rewrite_refs(&mut schema, &url_map);
assert_eq!(
schema["properties"]["rule"]["$ref"],
"_shared/project--rule.json#/$defs/SerializableRule"
);
assert_eq!(schema["properties"]["local"]["$ref"], "#/definitions/Local");
}
}