Skip to main content

cortex_runtime/acquisition/
api_discovery.rs

1//! Known public API discovery.
2//!
3//! Ships a mapping of well-known domains to their public APIs.
4//! For known domains, tries the API for richer data. Unknown domains skip silently.
5
6use super::http_client::HttpClient;
7use serde_json::Value;
8use std::collections::HashMap;
9
10/// A record returned from a public API.
11#[derive(Debug, Clone)]
12pub struct ApiRecord {
13    /// The original URL this record corresponds to.
14    pub url: String,
15    /// The API endpoint that was queried.
16    pub api_url: String,
17    /// Raw JSON response from the API.
18    pub data: Value,
19}
20
/// Describes how to reach the public API of one well-known domain.
struct KnownApi {
    /// URL template of the API endpoint. It carries a site-specific
    /// placeholder (`{title}`, `{owner}`/`{repo}`, `{package}`,
    /// `{crate_name}` or `{url}`) that is filled in from the page URL.
    api_template: &'static str,
    /// Kind of API: `"rest"` (the template is a full endpoint URL) or
    /// `"json_suffix"` (`.json` is appended to the page URL).
    api_type: &'static str,
}
28
29fn known_apis() -> HashMap<&'static str, KnownApi> {
30    let mut m = HashMap::new();
31    m.insert(
32        "en.wikipedia.org",
33        KnownApi {
34            api_template: "https://en.wikipedia.org/api/rest_v1/page/summary/{title}",
35            api_type: "rest",
36        },
37    );
38    m.insert(
39        "github.com",
40        KnownApi {
41            api_template: "https://api.github.com/repos/{owner}/{repo}",
42            api_type: "rest",
43        },
44    );
45    m.insert(
46        "reddit.com",
47        KnownApi {
48            api_template: "{url}.json",
49            api_type: "json_suffix",
50        },
51    );
52    m.insert(
53        "www.reddit.com",
54        KnownApi {
55            api_template: "{url}.json",
56            api_type: "json_suffix",
57        },
58    );
59    m.insert(
60        "www.npmjs.com",
61        KnownApi {
62            api_template: "https://registry.npmjs.org/{package}",
63            api_type: "rest",
64        },
65    );
66    m.insert(
67        "npmjs.com",
68        KnownApi {
69            api_template: "https://registry.npmjs.org/{package}",
70            api_type: "rest",
71        },
72    );
73    m.insert(
74        "pypi.org",
75        KnownApi {
76            api_template: "https://pypi.org/pypi/{package}/json",
77            api_type: "rest",
78        },
79    );
80    m.insert(
81        "crates.io",
82        KnownApi {
83            api_template: "https://crates.io/api/v1/crates/{crate_name}",
84            api_type: "rest",
85        },
86    );
87    m
88}
89
90/// Try to fetch data from a known public API for the given domain.
91///
92/// Returns `Some(records)` if the domain has a known API and data was fetched.
93/// Returns `None` for unknown domains (silent skip).
94pub async fn try_api(domain: &str, urls: &[String], client: &HttpClient) -> Option<Vec<ApiRecord>> {
95    let apis = known_apis();
96    let api = apis.get(domain)?;
97
98    let mut records = Vec::new();
99
100    for url in urls.iter().take(10) {
101        let api_url = match api.api_type {
102            "json_suffix" => format!("{url}.json"),
103            "rest" => build_rest_url(api.api_template, url, domain),
104            _ => continue,
105        };
106
107        if let Ok(resp) = client.get(&api_url, 5000).await {
108            if resp.status == 200 {
109                if let Ok(data) = serde_json::from_str::<Value>(&resp.body) {
110                    records.push(ApiRecord {
111                        url: url.clone(),
112                        api_url,
113                        data,
114                    });
115                }
116            }
117        }
118    }
119
120    if records.is_empty() {
121        None
122    } else {
123        Some(records)
124    }
125}
126
127/// Check if a domain has a known API (without making any requests).
128pub fn has_known_api(domain: &str) -> bool {
129    known_apis().contains_key(domain)
130}
131
/// Derives the API endpoint for the page `url` on `domain` by filling in
/// `template`.
///
/// The path of `url` (scheme, host, query string and fragment removed) is
/// split into its non-empty segments, then the site-specific placeholder is
/// substituted:
///
/// - Wikipedia (`domain` contains `wikipedia.org`): `/wiki/Title` → `{title}`.
///   Everything after the first segment is the title; a `/` inside the title
///   is emitted as `%2F` so it stays a single path component.
/// - GitHub: `/owner/repo[/…]` → `{owner}` and `{repo}`.
/// - npm / PyPI: `/package/name` or `/project/name` → `{package}`. Scoped npm
///   packages (`/package/@scope/name`) yield `@scope/name`. A lone segment is
///   used as the name.
/// - crates.io: `/crates/name` → `{crate_name}` (a lone segment is used as
///   the name).
/// - Reddit (`domain` contains `reddit.com`): `{url}` is replaced by the
///   complete, unmodified `url`.
///
/// Returns an empty string when the segments a site needs are missing, which
/// includes the case where `url` is not an `http(s)` URL on exactly `domain`.
/// For a domain matching none of the rules the template is returned as is.
fn build_rest_url(template: &str, url: &str, domain: &str) -> String {
    // Characters that may legally follow the host name.
    fn ends_host(c: char) -> bool {
        matches!(c, '/' | '?' | '#')
    }
    // Characters that terminate the path portion.
    fn ends_path(c: char) -> bool {
        matches!(c, '?' | '#')
    }

    // Require a delimiter (or end of string) right after the host so that
    // `https://github.com.evil.example/...` is not treated as `github.com`.
    let after_host = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .and_then(|rest| rest.strip_prefix(domain))
        .filter(|rest| rest.is_empty() || rest.starts_with(ends_host))
        .unwrap_or("");

    // Query string and fragment are not part of any identifier we extract.
    let path = after_host
        .find(ends_path)
        .map_or(after_host, |end| &after_host[..end]);

    let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();

    if domain.contains("wikipedia.org") {
        // Wikipedia: /wiki/Title → {title}
        match parts.get(1..) {
            Some(title) if !title.is_empty() => template.replace("{title}", &title.join("%2F")),
            _ => String::new(),
        }
    } else if domain == "github.com" {
        // GitHub: /owner/repo → {owner}/{repo}
        match parts.as_slice() {
            [owner, repo, ..] => template.replace("{owner}", owner).replace("{repo}", repo),
            _ => String::new(),
        }
    } else if domain.contains("npmjs.com") || domain == "pypi.org" {
        // npm: /package/name or /package/@scope/name → {package}
        // PyPI: /project/name → {package}
        let package = match parts.as_slice() {
            [_, scope, name, ..] if scope.starts_with('@') => format!("{scope}/{name}"),
            [_, name, ..] | [name] => (*name).to_string(),
            [] => return String::new(),
        };
        template.replace("{package}", &package)
    } else if domain == "crates.io" {
        // crates.io: /crates/name → {crate_name}
        match parts.get(1).or(parts.first()) {
            Some(crate_name) => template.replace("{crate_name}", crate_name),
            None => String::new(),
        }
    } else if domain.contains("reddit.com") {
        // Reddit: the template is `{url}.json`
        template.replace("{url}", url)
    } else {
        template.to_string()
    }
}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Domains present in the table are reported as known, others are not.
    #[test]
    fn test_has_known_api() {
        for known in ["en.wikipedia.org", "github.com", "crates.io"].iter() {
            assert!(has_known_api(known));
        }
        for unknown in ["example.com", "google.com"].iter() {
            assert!(!has_known_api(unknown));
        }
    }

    /// `/owner/repo` is mapped onto the GitHub repository endpoint.
    #[test]
    fn test_build_rest_url_github() {
        let template = "https://api.github.com/repos/{owner}/{repo}";
        let page = "https://github.com/cortex-ai/cortex";
        let expected = "https://api.github.com/repos/cortex-ai/cortex";

        assert_eq!(build_rest_url(template, page, "github.com"), expected);
    }

    /// `/wiki/Title` is mapped onto the Wikipedia page-summary endpoint.
    #[test]
    fn test_build_rest_url_wikipedia() {
        let template = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}";
        let page = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
        let expected =
            "https://en.wikipedia.org/api/rest_v1/page/summary/Rust_(programming_language)";

        assert_eq!(build_rest_url(template, page, "en.wikipedia.org"), expected);
    }

    /// `/package/name` is mapped onto the npm registry endpoint.
    #[test]
    fn test_build_rest_url_npm() {
        let template = "https://registry.npmjs.org/{package}";
        let page = "https://www.npmjs.com/package/express";
        let expected = "https://registry.npmjs.org/express";

        assert_eq!(build_rest_url(template, page, "www.npmjs.com"), expected);
    }

    /// `/crates/name` is mapped onto the crates.io API endpoint.
    #[test]
    fn test_build_rest_url_crates() {
        let template = "https://crates.io/api/v1/crates/{crate_name}";
        let page = "https://crates.io/crates/serde";
        let expected = "https://crates.io/api/v1/crates/serde";

        assert_eq!(build_rest_url(template, page, "crates.io"), expected);
    }
}
239}