Skip to main content

opensession_core/
source_uri.rs

1use base64::engine::general_purpose::URL_SAFE_NO_PAD;
2use base64::Engine;
3use regex::Regex;
4use std::fmt;
5
/// A parsed `os://` URI.
///
/// The textual form is produced by the `Display` impl and consumed by
/// [`SourceUri::parse`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceUri {
    /// An `os://src/<provider>/...` URI; the provider-specific parts live in
    /// the wrapped [`SourceSpec`].
    Src(SourceSpec),
    /// An `os://artifact/<sha256>` URI. When produced by `parse`, `sha256`
    /// is exactly 64 ASCII hex digits.
    Artifact { sha256: String },
}
11
/// Provider-specific payload of an `os://src/...` URI.
///
/// String fields hold the *decoded* values; percent-encoding and base64url
/// encoding are applied only when the URI is rendered.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceSpec {
    /// `os://src/local/<sha256>`.
    Local {
        /// 64 ASCII hex digits when produced by `SourceUri::parse`.
        sha256: String,
    },
    /// `os://src/gh/<owner>/<repo>/ref/<ref>/path/<path...>`.
    Gh {
        /// Rendered verbatim (not percent-encoded) in the URI.
        owner: String,
        /// Rendered verbatim (not percent-encoded) in the URI.
        repo: String,
        /// Decoded ref; percent-encoded as a single segment when rendered.
        r#ref: String,
        /// `/`-joined path; each segment is percent-encoded when rendered.
        path: String,
    },
    /// `os://src/gl/<project_b64>/ref/<ref>/path/<path...>`.
    Gl {
        /// Decoded project string; rendered as unpadded base64url.
        project: String,
        /// Decoded ref; percent-encoded as a single segment when rendered.
        r#ref: String,
        /// `/`-joined path; each segment is percent-encoded when rendered.
        path: String,
    },
    /// `os://src/git/<remote_b64>/ref/<ref>/path/<path...>`.
    Git {
        /// Decoded remote string; rendered as unpadded base64url.
        remote: String,
        /// Decoded ref; percent-encoded as a single segment when rendered.
        r#ref: String,
        /// `/`-joined path; each segment is percent-encoded when rendered.
        path: String,
    },
}
34
/// Reasons an input string can be rejected by `SourceUri::parse`.
///
/// Variants carrying a `String` hold the offending input fragment or a
/// human-readable description, which is interpolated into the message.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum SourceUriError {
    /// The input does not begin with the `os://` prefix.
    #[error("uri must start with os://")]
    InvalidScheme,
    /// The first segment is not `src`, or the provider segment is not one of
    /// `local`, `gh`, `gl`, `git`. Carries the unrecognised segment.
    #[error("unsupported uri kind: {0}")]
    UnsupportedKind(String),
    /// Segments are missing, misplaced, or fail owner/repo validation.
    /// Carries a description of the expected layout or the bad segment.
    #[error("invalid uri structure: {0}")]
    InvalidStructure(String),
    /// The hash is not exactly 64 ASCII hex digits. Carries the hash as given.
    #[error("invalid sha256: {0}")]
    InvalidHash(String),
    /// The ref segment failed percent-decoding or is empty after trimming.
    /// Carries the still-encoded segment.
    #[error("invalid ref encoding: {0}")]
    InvalidRefEncoding(String),
    /// A path segment failed percent-decoding or decoded to a disallowed
    /// value. Carries the still-encoded segment.
    #[error("invalid path encoding: {0}")]
    InvalidPathEncoding(String),
    /// A base64url segment failed to decode or is not valid UTF-8.
    /// Carries the still-encoded segment.
    #[error("invalid base64url segment: {0}")]
    InvalidBase64(String),
}
52
53impl SourceUri {
54    pub fn parse(input: &str) -> Result<Self, SourceUriError> {
55        let body = input
56            .strip_prefix("os://")
57            .ok_or(SourceUriError::InvalidScheme)?;
58
59        if let Some(hash) = body.strip_prefix("artifact/") {
60            validate_sha256(hash)?;
61            return Ok(Self::Artifact {
62                sha256: hash.to_string(),
63            });
64        }
65
66        let segments = split_non_empty(body);
67        if segments.len() < 2 {
68            return Err(SourceUriError::InvalidStructure(
69                "expected os://src/<provider>/...".to_string(),
70            ));
71        }
72
73        if segments[0] != "src" {
74            return Err(SourceUriError::UnsupportedKind(segments[0].to_string()));
75        }
76
77        let provider = segments[1];
78        let rest = &segments[2..];
79        match provider {
80            "local" => parse_local(rest),
81            "gh" => parse_gh(rest),
82            "gl" => parse_gl(rest),
83            "git" => parse_git(rest),
84            other => Err(SourceUriError::UnsupportedKind(other.to_string())),
85        }
86    }
87
88    pub fn is_remote_source(&self) -> bool {
89        matches!(
90            self,
91            Self::Src(SourceSpec::Gh { .. })
92                | Self::Src(SourceSpec::Gl { .. })
93                | Self::Src(SourceSpec::Git { .. })
94        )
95    }
96
97    pub fn as_local_hash(&self) -> Option<&str> {
98        match self {
99            Self::Src(SourceSpec::Local { sha256 }) => Some(sha256),
100            _ => None,
101        }
102    }
103
104    pub fn as_artifact_hash(&self) -> Option<&str> {
105        match self {
106            Self::Artifact { sha256 } => Some(sha256),
107            _ => None,
108        }
109    }
110
111    pub fn to_web_path(&self) -> Option<String> {
112        match self {
113            Self::Src(SourceSpec::Gh {
114                owner,
115                repo,
116                r#ref,
117                path,
118            }) => Some(format!(
119                "/src/gh/{owner}/{repo}/ref/{}/path/{}",
120                encode_ref(r#ref),
121                encode_path(path)
122            )),
123            Self::Src(SourceSpec::Gl {
124                project,
125                r#ref,
126                path,
127            }) => Some(format!(
128                "/src/gl/{}/ref/{}/path/{}",
129                encode_b64(project),
130                encode_ref(r#ref),
131                encode_path(path)
132            )),
133            Self::Src(SourceSpec::Git {
134                remote,
135                r#ref,
136                path,
137            }) => Some(format!(
138                "/src/git/{}/ref/{}/path/{}",
139                encode_b64(remote),
140                encode_ref(r#ref),
141                encode_path(path)
142            )),
143            _ => None,
144        }
145    }
146}
147
148impl fmt::Display for SourceUri {
149    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
150        match self {
151            Self::Artifact { sha256 } => write!(f, "os://artifact/{sha256}"),
152            Self::Src(SourceSpec::Local { sha256 }) => write!(f, "os://src/local/{sha256}"),
153            Self::Src(SourceSpec::Gh {
154                owner,
155                repo,
156                r#ref,
157                path,
158            }) => write!(
159                f,
160                "os://src/gh/{owner}/{repo}/ref/{}/path/{}",
161                encode_ref(r#ref),
162                encode_path(path)
163            ),
164            Self::Src(SourceSpec::Gl {
165                project,
166                r#ref,
167                path,
168            }) => write!(
169                f,
170                "os://src/gl/{}/ref/{}/path/{}",
171                encode_b64(project),
172                encode_ref(r#ref),
173                encode_path(path)
174            ),
175            Self::Src(SourceSpec::Git {
176                remote,
177                r#ref,
178                path,
179            }) => write!(
180                f,
181                "os://src/git/{}/ref/{}/path/{}",
182                encode_b64(remote),
183                encode_ref(r#ref),
184                encode_path(path)
185            ),
186        }
187    }
188}
189
190fn parse_local(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
191    if rest.len() != 1 {
192        return Err(SourceUriError::InvalidStructure(
193            "local uri must be os://src/local/<sha256>".to_string(),
194        ));
195    }
196    let hash = rest[0];
197    validate_sha256(hash)?;
198    Ok(SourceUri::Src(SourceSpec::Local {
199        sha256: hash.to_string(),
200    }))
201}
202
203fn parse_gh(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
204    if rest.len() < 6 {
205        return Err(SourceUriError::InvalidStructure(
206            "gh uri must be os://src/gh/<owner>/<repo>/ref/<ref>/path/<path...>".to_string(),
207        ));
208    }
209    if rest[2] != "ref" || rest[4] != "path" {
210        return Err(SourceUriError::InvalidStructure(
211            "gh uri must contain /ref/<ref>/path/<path...>".to_string(),
212        ));
213    }
214    validate_owner_repo(rest[0])?;
215    validate_owner_repo(rest[1])?;
216    let decoded_ref = decode_ref(rest[3])?;
217    let path = decode_path(&rest[5..])?;
218    Ok(SourceUri::Src(SourceSpec::Gh {
219        owner: rest[0].to_string(),
220        repo: rest[1].to_string(),
221        r#ref: decoded_ref,
222        path,
223    }))
224}
225
226fn parse_gl(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
227    if rest.len() < 5 {
228        return Err(SourceUriError::InvalidStructure(
229            "gl uri must be os://src/gl/<project_b64>/ref/<ref>/path/<path...>".to_string(),
230        ));
231    }
232    if rest[1] != "ref" || rest[3] != "path" {
233        return Err(SourceUriError::InvalidStructure(
234            "gl uri must contain /ref/<ref>/path/<path...>".to_string(),
235        ));
236    }
237    let project = decode_b64(rest[0])?;
238    let decoded_ref = decode_ref(rest[2])?;
239    let path = decode_path(&rest[4..])?;
240    Ok(SourceUri::Src(SourceSpec::Gl {
241        project,
242        r#ref: decoded_ref,
243        path,
244    }))
245}
246
247fn parse_git(rest: &[&str]) -> Result<SourceUri, SourceUriError> {
248    if rest.len() < 5 {
249        return Err(SourceUriError::InvalidStructure(
250            "git uri must be os://src/git/<remote_b64>/ref/<ref>/path/<path...>".to_string(),
251        ));
252    }
253    if rest[1] != "ref" || rest[3] != "path" {
254        return Err(SourceUriError::InvalidStructure(
255            "git uri must contain /ref/<ref>/path/<path...>".to_string(),
256        ));
257    }
258    let remote = decode_b64(rest[0])?;
259    let decoded_ref = decode_ref(rest[2])?;
260    let path = decode_path(&rest[4..])?;
261    Ok(SourceUri::Src(SourceSpec::Git {
262        remote,
263        r#ref: decoded_ref,
264        path,
265    }))
266}
267
268fn validate_sha256(hash: &str) -> Result<(), SourceUriError> {
269    let is_hex = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
270    if !is_hex {
271        return Err(SourceUriError::InvalidHash(hash.to_string()));
272    }
273    Ok(())
274}
275
276fn validate_owner_repo(value: &str) -> Result<(), SourceUriError> {
277    static OWNER_REPO_RE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
278        Regex::new(r"^[A-Za-z0-9._-]{1,200}$").expect("owner/repo regex should compile")
279    });
280    if OWNER_REPO_RE.is_match(value) {
281        Ok(())
282    } else {
283        Err(SourceUriError::InvalidStructure(format!(
284            "invalid owner/repo segment: {value}"
285        )))
286    }
287}
288
289fn encode_ref(value: &str) -> String {
290    urlencoding::encode(value).into_owned()
291}
292
293fn decode_ref(encoded: &str) -> Result<String, SourceUriError> {
294    let decoded = urlencoding::decode(encoded)
295        .map_err(|_| SourceUriError::InvalidRefEncoding(encoded.to_string()))?;
296    let trimmed = decoded.trim();
297    if trimmed.is_empty() {
298        return Err(SourceUriError::InvalidRefEncoding(encoded.to_string()));
299    }
300    Ok(trimmed.to_string())
301}
302
303fn encode_path(path: &str) -> String {
304    path.split('/')
305        .map(|segment| urlencoding::encode(segment).into_owned())
306        .collect::<Vec<_>>()
307        .join("/")
308}
309
310fn decode_path(segments: &[&str]) -> Result<String, SourceUriError> {
311    if segments.is_empty() {
312        return Err(SourceUriError::InvalidStructure(
313            "path segment is required".to_string(),
314        ));
315    }
316
317    let mut out = Vec::with_capacity(segments.len());
318    for encoded in segments {
319        let decoded = urlencoding::decode(encoded)
320            .map_err(|_| SourceUriError::InvalidPathEncoding((*encoded).to_string()))?;
321        let segment = decoded.trim();
322        if segment.is_empty() || segment == "." || segment == ".." || segment.contains('\\') {
323            return Err(SourceUriError::InvalidPathEncoding((*encoded).to_string()));
324        }
325        out.push(segment.to_string());
326    }
327    Ok(out.join("/"))
328}
329
330fn encode_b64(value: &str) -> String {
331    URL_SAFE_NO_PAD.encode(value.as_bytes())
332}
333
334fn decode_b64(value: &str) -> Result<String, SourceUriError> {
335    let bytes = URL_SAFE_NO_PAD
336        .decode(value.as_bytes())
337        .map_err(|_| SourceUriError::InvalidBase64(value.to_string()))?;
338    String::from_utf8(bytes).map_err(|_| SourceUriError::InvalidBase64(value.to_string()))
339}
340
/// Splits `value` on `/` and drops empty pieces, so leading, trailing and
/// doubled slashes produce no segments.
fn split_non_empty(value: &str) -> Vec<&str> {
    let mut segments = Vec::new();
    for piece in value.split('/') {
        if !piece.is_empty() {
            segments.push(piece);
        }
    }
    segments
}
347
/// Unit tests for URI parsing, rendering and the accessor helpers.
#[cfg(test)]
mod tests {
    use super::{SourceSpec, SourceUri, SourceUriError};

    #[test]
    fn parses_local_uri() {
        let hash = "a".repeat(64);
        let parsed = SourceUri::parse(&format!("os://src/local/{hash}")).expect("parse local");
        assert_eq!(
            parsed,
            SourceUri::Src(SourceSpec::Local {
                sha256: hash.clone()
            })
        );
        assert_eq!(parsed.to_string(), format!("os://src/local/{hash}"));
    }

    #[test]
    fn parses_gh_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Gh {
            owner: "hwisu".to_string(),
            repo: "opensession".to_string(),
            r#ref: "refs/heads/feature/x".to_string(),
            path: "sessions/abc.jsonl".to_string(),
        });
        let rendered = uri.to_string();
        let parsed = SourceUri::parse(&rendered).expect("parse gh");
        assert_eq!(parsed, uri);
        assert_eq!(
            parsed.to_web_path().as_deref(),
            Some(
                "/src/gh/hwisu/opensession/ref/refs%2Fheads%2Ffeature%2Fx/path/sessions/abc.jsonl"
            )
        );
    }

    #[test]
    fn parses_gl_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Gl {
            project: "group/sub/repo".to_string(),
            r#ref: "main".to_string(),
            path: "dir/session.hail.jsonl".to_string(),
        });
        let rendered = uri.to_string();
        let parsed = SourceUri::parse(&rendered).expect("parse gl");
        assert_eq!(parsed, uri);
    }

    #[test]
    fn parses_git_roundtrip() {
        let uri = SourceUri::Src(SourceSpec::Git {
            remote: "https://example.com/a/b.git".to_string(),
            r#ref: "refs/opensession/branches/bWFpbg".to_string(),
            path: "sessions/hash.jsonl".to_string(),
        });
        let rendered = uri.to_string();
        let parsed = SourceUri::parse(&rendered).expect("parse git");
        assert_eq!(parsed, uri);
    }

    #[test]
    fn parses_artifact_uri() {
        let hash = "f".repeat(64);
        let parsed = SourceUri::parse(&format!("os://artifact/{hash}")).expect("parse artifact");
        assert_eq!(parsed.to_string(), format!("os://artifact/{hash}"));
    }

    #[test]
    fn rejects_invalid_hash() {
        let err = SourceUri::parse("os://src/local/not-a-hash").expect_err("invalid hash");
        assert!(
            err.to_string().contains("invalid sha256"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn rejects_invalid_scheme() {
        let err = SourceUri::parse("https://example.com/src/local/abc").expect_err("bad scheme");
        assert_eq!(err, SourceUriError::InvalidScheme);
    }

    #[test]
    fn rejects_unsupported_kind_and_provider() {
        let err = SourceUri::parse("os://blob/abc").expect_err("unknown kind");
        assert_eq!(err, SourceUriError::UnsupportedKind("blob".to_string()));

        let err = SourceUri::parse("os://src/svn/repo").expect_err("unknown provider");
        assert_eq!(err, SourceUriError::UnsupportedKind("svn".to_string()));
    }

    #[test]
    fn rejects_parent_dir_path_segment() {
        let err = SourceUri::parse("os://src/gh/owner/repo/ref/main/path/../secret.jsonl")
            .expect_err("dot-dot segment");
        assert_eq!(err, SourceUriError::InvalidPathEncoding("..".to_string()));
    }

    #[test]
    fn local_uri_accessors() {
        let hash = "0".repeat(64);
        let local = SourceUri::parse(&format!("os://src/local/{hash}")).expect("parse local");
        assert!(!local.is_remote_source());
        assert_eq!(local.as_local_hash(), Some(hash.as_str()));
        assert_eq!(local.as_artifact_hash(), None);
        assert_eq!(local.to_web_path(), None);
    }
}