1use base64::engine::general_purpose::URL_SAFE_NO_PAD;
2use base64::Engine;
3use regex::Regex;
4use std::fmt;
5
/// A parsed `os://` URI.
///
/// Values are produced by [`SourceUri::parse`] and rendered back to text through the
/// `Display` implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceUri {
    /// An `os://src/<provider>/...` URI; the provider-specific details live in [`SourceSpec`].
    Src(SourceSpec),
    /// An `os://artifact/<sha256>` URI. `parse` only accepts a 64-character hex string here.
    Artifact { sha256: String },
}
11
/// Provider-specific payload of an `os://src/...` URI.
///
/// Every non-local variant carries a `ref` and a `/`-separated `path`, both stored in
/// decoded form (percent-encoding is applied only when the URI is rendered).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceSpec {
    /// `os://src/local/<sha256>`; `parse` only accepts a 64-character hex string.
    Local {
        sha256: String,
    },
    /// `os://src/gh/<owner>/<repo>/ref/<ref>/path/<path...>`.
    ///
    /// NOTE(review): `gh` presumably stands for GitHub — confirm against callers.
    /// `owner` and `repo` appear verbatim (not encoded) in the rendered URI.
    Gh {
        owner: String,
        repo: String,
        r#ref: String,
        path: String,
    },
    /// `os://src/gl/<project_b64>/ref/<ref>/path/<path...>`.
    ///
    /// NOTE(review): `gl` presumably stands for GitLab — confirm against callers.
    /// `project` is held decoded; it is base64url-encoded (no padding) in the URI.
    Gl {
        project: String,
        r#ref: String,
        path: String,
    },
    /// `os://src/git/<remote_b64>/ref/<ref>/path/<path...>`.
    ///
    /// `remote` is held decoded; it is base64url-encoded (no padding) in the URI.
    Git {
        remote: String,
        r#ref: String,
        path: String,
    },
}
34
/// Reasons an `os://` URI string can be rejected while parsing.
///
/// Except for `InvalidStructure`, the `String` payloads carry the offending piece of input.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum SourceUriError {
    /// The input does not begin with the `os://` prefix.
    #[error("uri must start with os://")]
    InvalidScheme,
    /// The first segment is not `src`, or the provider segment is not a known one.
    #[error("unsupported uri kind: {0}")]
    UnsupportedKind(String),
    /// The segment layout does not match the expected shape; the payload is a
    /// human-readable description of what was expected or what was wrong.
    #[error("invalid uri structure: {0}")]
    InvalidStructure(String),
    /// A hash was not exactly 64 ASCII hex digits.
    #[error("invalid sha256: {0}")]
    InvalidHash(String),
    /// The ref segment could not be percent-decoded, or was empty after trimming.
    #[error("invalid ref encoding: {0}")]
    InvalidRefEncoding(String),
    /// A path segment could not be percent-decoded, or decoded to a disallowed value
    /// (such as an empty segment, `.`, `..`, or one containing a backslash).
    #[error("invalid path encoding: {0}")]
    InvalidPathEncoding(String),
    /// A segment was not valid unpadded URL-safe base64, or its bytes were not UTF-8.
    #[error("invalid base64url segment: {0}")]
    InvalidBase64(String),
}
52
53impl SourceUri {
54 pub fn parse(input: &str) -> Result<Self, SourceUriError> {
55 let body = input
56 .strip_prefix("os://")
57 .ok_or(SourceUriError::InvalidScheme)?;
58
59 if let Some(hash) = body.strip_prefix("artifact/") {
60 validate_sha256(hash)?;
61 return Ok(Self::Artifact {
62 sha256: hash.to_string(),
63 });
64 }
65
66 let segments = split_non_empty(body);
67 if segments.len() < 2 {
68 return Err(SourceUriError::InvalidStructure(
69 "expected os://src/<provider>/...".to_string(),
70 ));
71 }
72
73 if segments[0] != "src" {
74 return Err(SourceUriError::UnsupportedKind(segments[0].to_string()));
75 }
76
77 let provider = segments[1];
78 let rest = &segments[2..];
79 match provider {
80 "local" => parse_local(rest),
81 "gh" => parse_gh(rest),
82 "gl" => parse_gl(rest),
83 "git" => parse_git(rest),
84 other => Err(SourceUriError::UnsupportedKind(other.to_string())),
85 }
86 }
87
88 pub fn is_remote_source(&self) -> bool {
89 matches!(
90 self,
91 Self::Src(SourceSpec::Gh { .. })
92 | Self::Src(SourceSpec::Gl { .. })
93 | Self::Src(SourceSpec::Git { .. })
94 )
95 }
96
97 pub fn as_local_hash(&self) -> Option<&str> {
98 match self {
99 Self::Src(SourceSpec::Local { sha256 }) => Some(sha256),
100 _ => None,
101 }
102 }
103
104 pub fn as_artifact_hash(&self) -> Option<&str> {
105 match self {
106 Self::Artifact { sha256 } => Some(sha256),
107 _ => None,
108 }
109 }
110
111 pub fn to_web_path(&self) -> Option<String> {
112 match self {
113 Self::Src(SourceSpec::Gh {
114 owner,
115 repo,
116 r#ref,
117 path,
118 }) => Some(format!(
119 "/src/gh/{owner}/{repo}/ref/{}/path/{}",
120 encode_ref(r#ref),
121 encode_path(path)
122 )),
123 Self::Src(SourceSpec::Gl {
124 project,
125 r#ref,
126 path,
127 }) => Some(format!(
128 "/src/gl/{}/ref/{}/path/{}",
129 encode_b64(project),
130 encode_ref(r#ref),
131 encode_path(path)
132 )),
133 Self::Src(SourceSpec::Git {
134 remote,
135 r#ref,
136 path,
137 }) => Some(format!(
138 "/src/git/{}/ref/{}/path/{}",
139 encode_b64(remote),
140 encode_ref(r#ref),
141 encode_path(path)
142 )),
143 _ => None,
144 }
145 }
146}
147
148impl fmt::Display for SourceUri {
149 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
150 match self {
151 Self::Artifact { sha256 } => write!(f, "os://artifact/{sha256}"),
152 Self::Src(SourceSpec::Local { sha256 }) => write!(f, "os://src/local/{sha256}"),
153 Self::Src(SourceSpec::Gh {
154 owner,
155 repo,
156 r#ref,
157 path,
158 }) => write!(
159 f,
160 "os://src/gh/{owner}/{repo}/ref/{}/path/{}",
161 encode_ref(r#ref),
162 encode_path(path)
163 ),
164 Self::Src(SourceSpec::Gl {
165 project,
166 r#ref,
167 path,
168 }) => write!(
169 f,
170 "os://src/gl/{}/ref/{}/path/{}",
171 encode_b64(project),
172 encode_ref(r#ref),
173 encode_path(path)
174 ),
175 Self::Src(SourceSpec::Git {
176 remote,
177 r#ref,
178 path,
179 }) => write!(
180 f,
181 "os://src/git/{}/ref/{}/path/{}",
182 encode_b64(remote),
183 encode_ref(r#ref),
184 encode_path(path)
185 ),
186 }
187 }
188}
189
190fn parse_local(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
191 if rest.len() != 1 {
192 return Err(SourceUriError::InvalidStructure(
193 "local uri must be os://src/local/<sha256>".to_string(),
194 ));
195 }
196 let hash = rest[0];
197 validate_sha256(hash)?;
198 Ok(SourceUri::Src(SourceSpec::Local {
199 sha256: hash.to_string(),
200 }))
201}
202
203fn parse_gh(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
204 if rest.len() < 6 {
205 return Err(SourceUriError::InvalidStructure(
206 "gh uri must be os://src/gh/<owner>/<repo>/ref/<ref>/path/<path...>".to_string(),
207 ));
208 }
209 if rest[2] != "ref" || rest[4] != "path" {
210 return Err(SourceUriError::InvalidStructure(
211 "gh uri must contain /ref/<ref>/path/<path...>".to_string(),
212 ));
213 }
214 validate_owner_repo(rest[0])?;
215 validate_owner_repo(rest[1])?;
216 let decoded_ref = decode_ref(rest[3])?;
217 let path = decode_path(&rest[5..])?;
218 Ok(SourceUri::Src(SourceSpec::Gh {
219 owner: rest[0].to_string(),
220 repo: rest[1].to_string(),
221 r#ref: decoded_ref,
222 path,
223 }))
224}
225
226fn parse_gl(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
227 if rest.len() < 5 {
228 return Err(SourceUriError::InvalidStructure(
229 "gl uri must be os://src/gl/<project_b64>/ref/<ref>/path/<path...>".to_string(),
230 ));
231 }
232 if rest[1] != "ref" || rest[3] != "path" {
233 return Err(SourceUriError::InvalidStructure(
234 "gl uri must contain /ref/<ref>/path/<path...>".to_string(),
235 ));
236 }
237 let project = decode_b64(rest[0])?;
238 let decoded_ref = decode_ref(rest[2])?;
239 let path = decode_path(&rest[4..])?;
240 Ok(SourceUri::Src(SourceSpec::Gl {
241 project,
242 r#ref: decoded_ref,
243 path,
244 }))
245}
246
247fn parse_git(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
248 if rest.len() < 5 {
249 return Err(SourceUriError::InvalidStructure(
250 "git uri must be os://src/git/<remote_b64>/ref/<ref>/path/<path...>".to_string(),
251 ));
252 }
253 if rest[1] != "ref" || rest[3] != "path" {
254 return Err(SourceUriError::InvalidStructure(
255 "git uri must contain /ref/<ref>/path/<path...>".to_string(),
256 ));
257 }
258 let remote = decode_b64(rest[0])?;
259 let decoded_ref = decode_ref(rest[2])?;
260 let path = decode_path(&rest[4..])?;
261 Ok(SourceUri::Src(SourceSpec::Git {
262 remote,
263 r#ref: decoded_ref,
264 path,
265 }))
266}
267
268fn validate_sha256(hash: &str) -> Result<(), SourceUriError> {
269 let is_hex = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
270 if !is_hex {
271 return Err(SourceUriError::InvalidHash(hash.to_string()));
272 }
273 Ok(())
274}
275
276fn validate_owner_repo(value: &str) -> Result<(), SourceUriError> {
277 static OWNER_REPO_RE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
278 Regex::new(r"^[A-Za-z0-9._-]{1,200}$").expect("owner/repo regex should compile")
279 });
280 if OWNER_REPO_RE.is_match(value) {
281 Ok(())
282 } else {
283 Err(SourceUriError::InvalidStructure(format!(
284 "invalid owner/repo segment: {value}"
285 )))
286 }
287}
288
289fn encode_ref(value: &str) -> String {
290 urlencoding::encode(value).into_owned()
291}
292
293fn decode_ref(encoded: &str) -> Result<String, SourceUriError> {
294 let decoded = urlencoding::decode(encoded)
295 .map_err(|_| SourceUriError::InvalidRefEncoding(encoded.to_string()))?;
296 let trimmed = decoded.trim();
297 if trimmed.is_empty() {
298 return Err(SourceUriError::InvalidRefEncoding(encoded.to_string()));
299 }
300 Ok(trimmed.to_string())
301}
302
303fn encode_path(path: &str) -> String {
304 path.split('/')
305 .map(|segment| urlencoding::encode(segment).into_owned())
306 .collect::<Vec<_>>()
307 .join("/")
308}
309
310fn decode_path(segments: &[&str]) -> Result<String, SourceUriError> {
311 if segments.is_empty() {
312 return Err(SourceUriError::InvalidStructure(
313 "path segment is required".to_string(),
314 ));
315 }
316
317 let mut out = Vec::with_capacity(segments.len());
318 for encoded in segments {
319 let decoded = urlencoding::decode(encoded)
320 .map_err(|_| SourceUriError::InvalidPathEncoding((*encoded).to_string()))?;
321 let segment = decoded.trim();
322 if segment.is_empty() || segment == "." || segment == ".." || segment.contains('\\') {
323 return Err(SourceUriError::InvalidPathEncoding((*encoded).to_string()));
324 }
325 out.push(segment.to_string());
326 }
327 Ok(out.join("/"))
328}
329
330fn encode_b64(value: &str) -> String {
331 URL_SAFE_NO_PAD.encode(value.as_bytes())
332}
333
334fn decode_b64(value: &str) -> Result<String, SourceUriError> {
335 let bytes = URL_SAFE_NO_PAD
336 .decode(value.as_bytes())
337 .map_err(|_| SourceUriError::InvalidBase64(value.to_string()))?;
338 String::from_utf8(bytes).map_err(|_| SourceUriError::InvalidBase64(value.to_string()))
339}
340
/// Splits `value` on `/` and keeps only the non-empty pieces, in order.
///
/// Leading, trailing and repeated slashes therefore contribute nothing to the result.
fn split_non_empty(value: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    for piece in value.split('/') {
        if !piece.is_empty() {
            pieces.push(piece);
        }
    }
    pieces
}
347
#[cfg(test)]
mod tests {
    use super::{SourceSpec, SourceUri};

    /// Renders `uri`, parses the rendered text back, and checks the result equals `uri`.
    /// `label` is the panic message used if parsing fails. Returns the parsed value.
    fn assert_roundtrip(uri: &SourceUri, label: &str) -> SourceUri {
        let rendered = uri.to_string();
        let parsed = SourceUri::parse(&rendered).expect(label);
        assert_eq!(&parsed, uri);
        parsed
    }

    #[test]
    fn parses_local_uri() {
        let hash = "a".repeat(64);
        let text = format!("os://src/local/{hash}");

        let parsed = SourceUri::parse(&text).expect("parse local");

        assert_eq!(parsed.to_string(), text);
        assert_eq!(parsed, SourceUri::Src(SourceSpec::Local { sha256: hash }));
    }

    #[test]
    fn parses_gh_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Gh {
            owner: "hwisu".to_string(),
            repo: "opensession".to_string(),
            r#ref: "refs/heads/feature/x".to_string(),
            path: "sessions/abc.jsonl".to_string(),
        });

        let parsed = assert_roundtrip(&uri, "parse gh");

        // The ref is one encoded segment; path separators stay literal.
        let web_path = parsed.to_web_path();
        assert_eq!(
            web_path.as_deref(),
            Some(
                "/src/gh/hwisu/opensession/ref/refs%2Fheads%2Ffeature%2Fx/path/sessions/abc.jsonl"
            )
        );
    }

    #[test]
    fn parses_gl_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Gl {
            project: "group/sub/repo".to_string(),
            r#ref: "main".to_string(),
            path: "dir/session.hail.jsonl".to_string(),
        });

        assert_roundtrip(&uri, "parse gl");
    }

    #[test]
    fn parses_git_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Git {
            remote: "https://example.com/a/b.git".to_string(),
            r#ref: "refs/opensession/branches/bWFpbg".to_string(),
            path: "sessions/hash.jsonl".to_string(),
        });

        assert_roundtrip(&uri, "parse git");
    }

    #[test]
    fn parses_artifact_uri() {
        let hash = "f".repeat(64);
        let text = format!("os://artifact/{hash}");

        let parsed = SourceUri::parse(&text).expect("parse artifact");

        assert_eq!(parsed.to_string(), text);
    }

    #[test]
    fn rejects_invalid_hash() {
        let err = SourceUri::parse("os://src/local/not-a-hash").expect_err("invalid hash");
        let message = err.to_string();
        assert!(
            message.contains("invalid sha256"),
            "unexpected error: {err}"
        );
    }
}