Skip to main content

panproto_protocols/
raw_file.rs

1//! Raw file protocol for non-code files.
2//!
3//! Handles files that don't have a language protocol (README.md, LICENSE,
4//! .gitignore, images, Makefile, etc.) by representing them as ordered
5//! sequences of lines (text) or single opaque chunks (binary).
6//!
7//! ## Theory composition
8//!
9//! ```text
10//! ThRawFile = colimit(ThGraph, ThOrder, shared=ThVertexEdge)
11//! ```
12//!
13//! ## Vertex kinds
14//!
15//! - `file`: the root vertex representing the entire file
16//! - `line`: a single line of text (ordered via ThOrder)
17//! - `chunk`: an opaque binary blob
18//!
19//! ## Edge rules
20//!
21//! - `line-of`: file → line (ordered)
22//! - `chunk-of`: file → chunk
23//!
24//! ## Merge behavior
25//!
26//! Text files merge via pushout on ordered line sequences (the same algorithm
27//! as all other ordered schemas). Binary files are opaque (whole-file replacement).
28
29use std::collections::HashMap;
30use std::hash::BuildHasher;
31
32use panproto_gat::Theory;
33use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
34
35use crate::error::ProtocolError;
36use crate::theories;
37
38/// Returns the raw file protocol definition.
39#[must_use]
40pub fn protocol() -> Protocol {
41    Protocol {
42        name: "raw_file".into(),
43        schema_theory: "ThRawFileSchema".into(),
44        instance_theory: "ThRawFileInstance".into(),
45        schema_composition: None,
46        instance_composition: None,
47        edge_rules: vec![
48            EdgeRule {
49                edge_kind: "line-of".into(),
50                src_kinds: vec!["file".into()],
51                tgt_kinds: vec!["line".into()],
52            },
53            EdgeRule {
54                edge_kind: "chunk-of".into(),
55                src_kinds: vec!["file".into()],
56                tgt_kinds: vec!["chunk".into()],
57            },
58        ],
59        obj_kinds: vec!["file".into(), "line".into(), "chunk".into()],
60        constraint_sorts: vec![
61            "mime-type".into(),
62            "encoding".into(),
63            "line-number".into(),
64            "content".into(),
65            "content-length".into(),
66            "content-hash".into(),
67        ],
68        has_order: true,
69        has_coproducts: false,
70        has_recursion: false,
71        has_causal: false,
72        nominal_identity: false,
73        has_defaults: false,
74        has_coercions: false,
75        has_mergers: false,
76        has_policies: false,
77    }
78}
79
80/// Register the raw file theory pair.
81///
82/// Schema: `colimit(ThGraph, ThOrder, shared=ThVertexEdge)`.
83/// Instance: `ThWType`.
84pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
85    theories::register_constrained_multigraph_wtype(
86        registry,
87        "ThRawFileSchema",
88        "ThRawFileInstance",
89    );
90}
91
92/// Parse a text file into a raw file [`Schema`].
93///
94/// Each line becomes a `line` vertex connected to the root `file` vertex
95/// via a `line-of` edge. Lines are ordered via positional indices.
96///
97/// # Errors
98///
99/// Returns [`ProtocolError`] if schema construction fails.
100pub fn parse_text(input: &str, file_path: &str) -> Result<Schema, ProtocolError> {
101    let proto = protocol();
102    let mut builder = SchemaBuilder::new(&proto);
103
104    // Root file vertex.
105    let file_id = file_path;
106    builder = builder
107        .vertex(file_id, "file", None)
108        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?;
109
110    // Detect mime type from extension.
111    let mime = mime_from_path(file_path);
112    builder = builder.constraint(file_id, "mime-type", &mime);
113    builder = builder.constraint(file_id, "encoding", "utf-8");
114
115    // One line vertex per line.
116    for (i, line_text) in input.lines().enumerate() {
117        let line_id = format!("{file_id}::line_{i}");
118        builder = builder
119            .vertex(&line_id, "line", None)
120            .map_err(|e| ProtocolError::Parse(format!("line {i}: {e}")))?;
121
122        builder = builder
123            .edge(file_id, &line_id, "line-of", None)
124            .map_err(|e| ProtocolError::Parse(format!("line-of edge {i}: {e}")))?;
125
126        builder = builder.constraint(&line_id, "content", line_text);
127        builder = builder.constraint(&line_id, "line-number", &i.to_string());
128    }
129
130    builder
131        .build()
132        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
133}
134
135/// Parse a binary file into a raw file [`Schema`].
136///
137/// The entire file becomes a single `chunk` vertex connected to the root
138/// `file` vertex via a `chunk-of` edge.
139///
140/// # Errors
141///
142/// Returns [`ProtocolError`] if schema construction fails.
143pub fn parse_binary(file_path: &str, content: &[u8]) -> Result<Schema, ProtocolError> {
144    let proto = protocol();
145    let mut builder = SchemaBuilder::new(&proto);
146
147    let file_id = file_path;
148    builder = builder
149        .vertex(file_id, "file", None)
150        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?;
151
152    let mime = mime_from_path(file_path);
153    builder = builder.constraint(file_id, "mime-type", &mime);
154    builder = builder.constraint(file_id, "encoding", "binary");
155    builder = builder.constraint(file_id, "content-length", &content.len().to_string());
156
157    let chunk_id = format!("{file_id}::chunk_0");
158    builder = builder
159        .vertex(&chunk_id, "chunk", None)
160        .map_err(|e| ProtocolError::Parse(format!("chunk vertex: {e}")))?;
161
162    builder = builder
163        .edge(file_id, &chunk_id, "chunk-of", None)
164        .map_err(|e| ProtocolError::Parse(format!("chunk-of edge: {e}")))?;
165
166    // Store a content hash on the chunk vertex so the schema tracks identity
167    // of the binary content. The actual bytes are stored by the VCS object store,
168    // not inline in the schema (binary data can be arbitrarily large).
169    let hash = blake3::hash(content);
170    let hex = hash.to_hex();
171    builder = builder.constraint(&chunk_id, "content-hash", hex.as_str());
172
173    builder
174        .build()
175        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
176}
177
178/// Emit a raw file schema back to text.
179///
180/// Walks line vertices in order, joining them with newlines.
181///
182/// # Errors
183///
184/// Returns [`ProtocolError`] if the schema structure is invalid.
185pub fn emit_text(schema: &Schema) -> Result<String, ProtocolError> {
186    // Collect line vertices with their line numbers.
187    let mut lines: Vec<(usize, String)> = Vec::new();
188
189    for (name, vertex) in &schema.vertices {
190        if vertex.kind.as_ref() == "line" {
191            let line_num = schema
192                .constraints
193                .get(name)
194                .and_then(|cs| {
195                    cs.iter()
196                        .find(|c| c.sort.as_ref() == "line-number")
197                        .and_then(|c| c.value.parse::<usize>().ok())
198                })
199                .unwrap_or(lines.len());
200
201            let content = schema
202                .constraints
203                .get(name)
204                .and_then(|cs| {
205                    cs.iter()
206                        .find(|c| c.sort.as_ref() == "content")
207                        .map(|c| c.value.clone())
208                })
209                .unwrap_or_default();
210
211            lines.push((line_num, content));
212        }
213    }
214
215    // Sort by line number.
216    lines.sort_by_key(|(num, _)| *num);
217
218    let text: Vec<&str> = lines.iter().map(|(_, content)| content.as_str()).collect();
219    let mut result = text.join("\n");
220    if !result.is_empty() {
221        result.push('\n');
222    }
223    Ok(result)
224}
225
/// Maps a file path to a MIME type string based on its extension.
///
/// The extension is whatever follows the last `.` in `path`; it is lowercased
/// before being matched against the table below. Paths that contain no `.`
/// and extensions that are not in the table map to
/// `application/octet-stream`.
fn mime_from_path(path: &str) -> String {
    // `rsplit_once` returns `None` when there is no dot at all, in which case
    // the extension is treated as empty.
    let extension = path.rsplit_once('.').map_or("", |(_, tail)| tail);

    let mime = match extension.to_lowercase().as_str() {
        // Plain text and text-like configuration files.
        "txt" | "gitignore" | "env" | "lock" | "cfg" | "ini" | "log" => "text/plain",
        "md" | "markdown" => "text/markdown",
        "yaml" | "yml" => "text/yaml",
        "toml" => "text/toml",
        "html" | "htm" => "text/html",
        "css" => "text/css",
        "csv" => "text/csv",
        "tsv" => "text/tab-separated-values",
        "sh" | "bash" => "text/x-shellscript",
        "dockerfile" => "text/x-dockerfile",
        "makefile" => "text/x-makefile",
        // Structured data and documents.
        "json" => "application/json",
        "xml" => "application/xml",
        "pdf" => "application/pdf",
        // Images.
        "svg" => "image/svg+xml",
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        // Archives and binaries.
        "zip" => "application/zip",
        "tar" => "application/x-tar",
        "gz" => "application/gzip",
        "wasm" => "application/wasm",
        _ => "application/octet-stream",
    };

    String::from(mime)
}
268
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The protocol descriptor exposes the expected name, kinds and rules.
    #[test]
    fn protocol_def() {
        let p = protocol();
        assert!(p.has_order);
        assert_eq!(p.edge_rules.len(), 2);
        assert_eq!(p.obj_kinds.len(), 3);
        assert_eq!(p.name, "raw_file");
    }

    /// Both theory names end up in the registry.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        for key in ["ThRawFileSchema", "ThRawFileInstance"] {
            assert!(registry.contains_key(key));
        }
    }

    /// Three input lines produce one root vertex plus three line vertices,
    /// and the root carries the MIME type derived from the `.md` extension.
    #[test]
    fn parse_text_file() {
        let schema = parse_text("Hello World\nSecond line\nThird line", "README.md").unwrap();
        assert_eq!(schema.vertices.len(), 4);

        let root: panproto_gat::Name = "README.md".into();
        let mime = schema
            .constraints
            .get(&root)
            .unwrap()
            .iter()
            .find(|c| c.sort.as_ref() == "mime-type")
            .unwrap();
        assert_eq!(mime.value, "text/markdown");
    }

    /// Newline-terminated text survives a parse/emit round trip unchanged.
    #[test]
    fn parse_and_emit_roundtrip() {
        let original = "line one\nline two\nline three\n";
        let emitted = emit_text(&parse_text(original, "test.txt").unwrap()).unwrap();
        assert_eq!(emitted, original);
    }

    /// Empty input yields only the root file vertex.
    #[test]
    fn parse_empty_file() {
        let schema = parse_text("", "empty.txt").unwrap();
        assert_eq!(schema.vertices.len(), 1);
    }

    /// Binary input yields a root vertex and one chunk vertex; the root is
    /// tagged with the MIME type and the `binary` encoding.
    #[test]
    fn parse_binary_file() {
        let schema = parse_binary("image.png", &[0x89, 0x50, 0x4E, 0x47]).unwrap();
        assert_eq!(schema.vertices.len(), 2);

        let root: panproto_gat::Name = "image.png".into();
        let constraints = schema.constraints.get(&root).unwrap();
        let value_of = |sort: &str| {
            constraints
                .iter()
                .find(|c| c.sort.as_ref() == sort)
                .unwrap()
                .value
                .clone()
        };

        assert_eq!(value_of("mime-type"), "image/png");
        assert_eq!(value_of("encoding"), "binary");
    }

    /// Extension-based MIME detection, including the no-extension case.
    #[test]
    fn mime_detection() {
        let cases = [
            ("README.md", "text/markdown"),
            ("data.json", "application/json"),
            ("photo.jpg", "image/jpeg"),
            ("unknown.xyz", "application/octet-stream"),
            // "Dockerfile" contains no dot, so its extension is treated as
            // empty and it falls through to the octet-stream default.
            ("Dockerfile", "application/octet-stream"),
            ("app.dockerfile", "text/x-dockerfile"),
        ];
        for (path, expected) in cases {
            assert_eq!(mime_from_path(path), expected);
        }
    }
}
356}