cortex_runtime/acquisition/
api_discovery.rs1use super::http_client::HttpClient;
7use serde_json::Value;
8use std::collections::HashMap;
9
10#[derive(Debug, Clone)]
12pub struct ApiRecord {
13 pub url: String,
15 pub api_url: String,
17 pub data: Value,
19}
20
/// Describes how to reach the public API of one known domain.
struct KnownApi {
    /// URL pattern of the API endpoint; `{placeholder}` segments are filled
    /// in from the page URL.
    api_template: &'static str,
    /// Strategy used to derive the endpoint from a page URL. The values used
    /// in this file are `"rest"` and `"json_suffix"`.
    api_type: &'static str,
}
28
29fn known_apis() -> HashMap<&'static str, KnownApi> {
30 let mut m = HashMap::new();
31 m.insert(
32 "en.wikipedia.org",
33 KnownApi {
34 api_template: "https://en.wikipedia.org/api/rest_v1/page/summary/{title}",
35 api_type: "rest",
36 },
37 );
38 m.insert(
39 "github.com",
40 KnownApi {
41 api_template: "https://api.github.com/repos/{owner}/{repo}",
42 api_type: "rest",
43 },
44 );
45 m.insert(
46 "reddit.com",
47 KnownApi {
48 api_template: "{url}.json",
49 api_type: "json_suffix",
50 },
51 );
52 m.insert(
53 "www.reddit.com",
54 KnownApi {
55 api_template: "{url}.json",
56 api_type: "json_suffix",
57 },
58 );
59 m.insert(
60 "www.npmjs.com",
61 KnownApi {
62 api_template: "https://registry.npmjs.org/{package}",
63 api_type: "rest",
64 },
65 );
66 m.insert(
67 "npmjs.com",
68 KnownApi {
69 api_template: "https://registry.npmjs.org/{package}",
70 api_type: "rest",
71 },
72 );
73 m.insert(
74 "pypi.org",
75 KnownApi {
76 api_template: "https://pypi.org/pypi/{package}/json",
77 api_type: "rest",
78 },
79 );
80 m.insert(
81 "crates.io",
82 KnownApi {
83 api_template: "https://crates.io/api/v1/crates/{crate_name}",
84 api_type: "rest",
85 },
86 );
87 m
88}
89
90pub async fn try_api(domain: &str, urls: &[String], client: &HttpClient) -> Option<Vec<ApiRecord>> {
95 let apis = known_apis();
96 let api = apis.get(domain)?;
97
98 let mut records = Vec::new();
99
100 for url in urls.iter().take(10) {
101 let api_url = match api.api_type {
102 "json_suffix" => format!("{url}.json"),
103 "rest" => build_rest_url(api.api_template, url, domain),
104 _ => continue,
105 };
106
107 if let Ok(resp) = client.get(&api_url, 5000).await {
108 if resp.status == 200 {
109 if let Ok(data) = serde_json::from_str::<Value>(&resp.body) {
110 records.push(ApiRecord {
111 url: url.clone(),
112 api_url,
113 data,
114 });
115 }
116 }
117 }
118 }
119
120 if records.is_empty() {
121 None
122 } else {
123 Some(records)
124 }
125}
126
127pub fn has_known_api(domain: &str) -> bool {
129 known_apis().contains_key(domain)
130}
131
/// Fills the placeholders of `template` from the path of `url`.
///
/// `url` is expected to be an `http://` or `https://` URL on `domain`. Its
/// path (without query string or fragment) is split into non-empty segments,
/// which are mapped onto the template depending on `domain`:
///
/// * `*wikipedia.org`: `{title}` is the second segment (`/wiki/<title>`).
/// * `github.com`: `{owner}` and `{repo}` are the first two segments.
/// * `*npmjs.com` / `pypi.org`: `{package}` is the second segment, falling
///   back to the first. Scoped npm packages (`/package/@scope/name`) yield
///   `@scope/name`.
/// * `crates.io`: `{crate_name}` is the second segment, falling back to the
///   first.
/// * `*reddit.com`: `{url}` is replaced by the full, unmodified `url`.
///
/// Any other domain returns the template unchanged.
///
/// Returns an empty string when the required segments are missing, which
/// includes the case where `url` is not on `domain` at all.
fn build_rest_url(template: &str, url: &str, domain: &str) -> String {
    // Strip scheme and host. The remainder must be empty or begin a path,
    // query or fragment; otherwise `domain` only matched a prefix of a longer
    // host name (or the URL carries a port) and the remainder is not a path.
    let after_host = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .and_then(|rest| rest.strip_prefix(domain))
        .filter(|rest| {
            rest.is_empty() || rest.starts_with(|c: char| matches!(c, '/' | '?' | '#'))
        })
        .unwrap_or("");

    // Query strings and fragments are not part of the resource name and must
    // not leak into the placeholders.
    let path = after_host
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    let parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();

    if domain.contains("wikipedia.org") {
        match parts.get(1) {
            Some(title) => template.replace("{title}", title),
            None => String::new(),
        }
    } else if domain == "github.com" {
        match parts.as_slice() {
            [owner, repo, ..] => template
                .replace("{owner}", owner)
                .replace("{repo}", repo),
            _ => String::new(),
        }
    } else if domain.contains("npmjs.com") || domain == "pypi.org" {
        let is_npm = domain.contains("npmjs.com");
        let package = match parts.as_slice() {
            // Scoped npm package: the name spans two path segments.
            [_, scope, name, ..] if is_npm && scope.starts_with('@') => {
                format!("{scope}/{name}")
            }
            [_, pkg, ..] | [pkg] => (*pkg).to_string(),
            [] => return String::new(),
        };
        template.replace("{package}", &package)
    } else if domain == "crates.io" {
        match parts.get(1).or(parts.first()) {
            Some(crate_name) => template.replace("{crate_name}", crate_name),
            None => String::new(),
        }
    } else if domain.contains("reddit.com") {
        template.replace("{url}", url)
    } else {
        template.to_string()
    }
}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Listed hosts are recognised; unrelated hosts are not.
    #[test]
    fn test_has_known_api() {
        for known in ["en.wikipedia.org", "github.com", "crates.io"] {
            assert!(has_known_api(known));
        }
        for unknown in ["example.com", "google.com"] {
            assert!(!has_known_api(unknown));
        }
    }

    /// `/owner/repo` maps onto the GitHub repos endpoint.
    #[test]
    fn test_build_rest_url_github() {
        let template = "https://api.github.com/repos/{owner}/{repo}";
        let page = "https://github.com/cortex-ai/cortex";

        let api_url = build_rest_url(template, page, "github.com");

        assert_eq!(api_url, "https://api.github.com/repos/cortex-ai/cortex");
    }

    /// The article title after `/wiki/` is carried over verbatim.
    #[test]
    fn test_build_rest_url_wikipedia() {
        let template = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}";
        let page = "https://en.wikipedia.org/wiki/Rust_(programming_language)";

        let api_url = build_rest_url(template, page, "en.wikipedia.org");

        assert_eq!(
            api_url,
            "https://en.wikipedia.org/api/rest_v1/page/summary/Rust_(programming_language)"
        );
    }

    /// `/package/<name>` maps onto the npm registry document for `<name>`.
    #[test]
    fn test_build_rest_url_npm() {
        let template = "https://registry.npmjs.org/{package}";
        let page = "https://www.npmjs.com/package/express";

        let api_url = build_rest_url(template, page, "www.npmjs.com");

        assert_eq!(api_url, "https://registry.npmjs.org/express");
    }

    /// `/crates/<name>` maps onto the crates.io v1 API for `<name>`.
    #[test]
    fn test_build_rest_url_crates() {
        let template = "https://crates.io/api/v1/crates/{crate_name}";
        let page = "https://crates.io/crates/serde";

        let api_url = build_rest_url(template, page, "crates.io");

        assert_eq!(api_url, "https://crates.io/api/v1/crates/serde");
    }
}