Skip to main content

panproto_protocols/
raw_file.rs

1//! Raw file protocol for non-code files.
2//!
3//! Handles files that don't have a language protocol (README.md, LICENSE,
4//! .gitignore, images, Makefile, etc.) by representing them as ordered
5//! sequences of lines (text) or single opaque chunks (binary).
6//!
7//! ## Theory composition
8//!
9//! ```text
10//! ThRawFile = colimit(ThGraph, ThOrder, shared=ThVertexEdge)
11//! ```
12//!
13//! ## Vertex kinds
14//!
15//! - `file`: the root vertex representing the entire file
16//! - `line`: a single line of text (ordered via ThOrder)
17//! - `chunk`: an opaque binary blob
18//!
19//! ## Edge rules
20//!
21//! - `line-of`: file → line (ordered)
22//! - `chunk-of`: file → chunk
23//!
24//! ## Merge behavior
25//!
26//! Text files merge via pushout on ordered line sequences (the same algorithm
27//! as all other ordered schemas). Binary files are opaque (whole-file replacement).
28
29use std::collections::HashMap;
30use std::hash::BuildHasher;
31
32use panproto_gat::Theory;
33use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
34
35use crate::error::ProtocolError;
36use crate::theories;
37
38/// Returns the raw file protocol definition.
39#[must_use]
40pub fn protocol() -> Protocol {
41    Protocol {
42        name: "raw_file".into(),
43        schema_theory: "ThRawFileSchema".into(),
44        instance_theory: "ThRawFileInstance".into(),
45        edge_rules: vec![
46            EdgeRule {
47                edge_kind: "line-of".into(),
48                src_kinds: vec!["file".into()],
49                tgt_kinds: vec!["line".into()],
50            },
51            EdgeRule {
52                edge_kind: "chunk-of".into(),
53                src_kinds: vec!["file".into()],
54                tgt_kinds: vec!["chunk".into()],
55            },
56        ],
57        obj_kinds: vec!["file".into(), "line".into(), "chunk".into()],
58        constraint_sorts: vec![
59            "mime-type".into(),
60            "encoding".into(),
61            "line-number".into(),
62            "content".into(),
63            "content-length".into(),
64            "content-hash".into(),
65        ],
66        has_order: true,
67        has_coproducts: false,
68        has_recursion: false,
69        has_causal: false,
70        nominal_identity: false,
71        has_defaults: false,
72        has_coercions: false,
73        has_mergers: false,
74        has_policies: false,
75    }
76}
77
78/// Register the raw file theory pair.
79///
80/// Schema: `colimit(ThGraph, ThOrder, shared=ThVertexEdge)`.
81/// Instance: `ThWType`.
82pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
83    theories::register_constrained_multigraph_wtype(
84        registry,
85        "ThRawFileSchema",
86        "ThRawFileInstance",
87    );
88}
89
90/// Parse a text file into a raw file [`Schema`].
91///
92/// Each line becomes a `line` vertex connected to the root `file` vertex
93/// via a `line-of` edge. Lines are ordered via positional indices.
94///
95/// # Errors
96///
97/// Returns [`ProtocolError`] if schema construction fails.
98pub fn parse_text(input: &str, file_path: &str) -> Result<Schema, ProtocolError> {
99    let proto = protocol();
100    let mut builder = SchemaBuilder::new(&proto);
101
102    // Root file vertex.
103    let file_id = file_path;
104    builder = builder
105        .vertex(file_id, "file", None)
106        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?;
107
108    // Detect mime type from extension.
109    let mime = mime_from_path(file_path);
110    builder = builder.constraint(file_id, "mime-type", &mime);
111    builder = builder.constraint(file_id, "encoding", "utf-8");
112
113    // One line vertex per line.
114    for (i, line_text) in input.lines().enumerate() {
115        let line_id = format!("{file_id}::line_{i}");
116        builder = builder
117            .vertex(&line_id, "line", None)
118            .map_err(|e| ProtocolError::Parse(format!("line {i}: {e}")))?;
119
120        builder = builder
121            .edge(file_id, &line_id, "line-of", None)
122            .map_err(|e| ProtocolError::Parse(format!("line-of edge {i}: {e}")))?;
123
124        builder = builder.constraint(&line_id, "content", line_text);
125        builder = builder.constraint(&line_id, "line-number", &i.to_string());
126    }
127
128    builder
129        .build()
130        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
131}
132
133/// Parse a binary file into a raw file [`Schema`].
134///
135/// The entire file becomes a single `chunk` vertex connected to the root
136/// `file` vertex via a `chunk-of` edge.
137///
138/// # Errors
139///
140/// Returns [`ProtocolError`] if schema construction fails.
141pub fn parse_binary(file_path: &str, content: &[u8]) -> Result<Schema, ProtocolError> {
142    let proto = protocol();
143    let mut builder = SchemaBuilder::new(&proto);
144
145    let file_id = file_path;
146    builder = builder
147        .vertex(file_id, "file", None)
148        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?;
149
150    let mime = mime_from_path(file_path);
151    builder = builder.constraint(file_id, "mime-type", &mime);
152    builder = builder.constraint(file_id, "encoding", "binary");
153    builder = builder.constraint(file_id, "content-length", &content.len().to_string());
154
155    let chunk_id = format!("{file_id}::chunk_0");
156    builder = builder
157        .vertex(&chunk_id, "chunk", None)
158        .map_err(|e| ProtocolError::Parse(format!("chunk vertex: {e}")))?;
159
160    builder = builder
161        .edge(file_id, &chunk_id, "chunk-of", None)
162        .map_err(|e| ProtocolError::Parse(format!("chunk-of edge: {e}")))?;
163
164    // Store a content hash on the chunk vertex so the schema tracks identity
165    // of the binary content. The actual bytes are stored by the VCS object store,
166    // not inline in the schema (binary data can be arbitrarily large).
167    let hash = blake3::hash(content);
168    let hex = hash.to_hex();
169    builder = builder.constraint(&chunk_id, "content-hash", hex.as_str());
170
171    builder
172        .build()
173        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
174}
175
176/// Emit a raw file schema back to text.
177///
178/// Walks line vertices in order, joining them with newlines.
179///
180/// # Errors
181///
182/// Returns [`ProtocolError`] if the schema structure is invalid.
183pub fn emit_text(schema: &Schema) -> Result<String, ProtocolError> {
184    // Collect line vertices with their line numbers.
185    let mut lines: Vec<(usize, String)> = Vec::new();
186
187    for (name, vertex) in &schema.vertices {
188        if vertex.kind.as_ref() == "line" {
189            let line_num = schema
190                .constraints
191                .get(name)
192                .and_then(|cs| {
193                    cs.iter()
194                        .find(|c| c.sort.as_ref() == "line-number")
195                        .and_then(|c| c.value.parse::<usize>().ok())
196                })
197                .unwrap_or(lines.len());
198
199            let content = schema
200                .constraints
201                .get(name)
202                .and_then(|cs| {
203                    cs.iter()
204                        .find(|c| c.sort.as_ref() == "content")
205                        .map(|c| c.value.clone())
206                })
207                .unwrap_or_default();
208
209            lines.push((line_num, content));
210        }
211    }
212
213    // Sort by line number.
214    lines.sort_by_key(|(num, _)| *num);
215
216    let text: Vec<&str> = lines.iter().map(|(_, content)| content.as_str()).collect();
217    let mut result = text.join("\n");
218    if !result.is_empty() {
219        result.push('\n');
220    }
221    Ok(result)
222}
223
/// Guess a MIME type from the extension of `path`.
///
/// The extension is whatever follows the last `.` in `path`, compared
/// case-insensitively. A path without any `.` has no extension. Unknown or
/// missing extensions map to `application/octet-stream`.
fn mime_from_path(path: &str) -> String {
    // `rsplit_once` yields `None` when there is no dot, which leaves the
    // extension empty and therefore unmatched below.
    let extension = path
        .rsplit_once('.')
        .map_or_else(String::new, |(_, tail)| tail.to_lowercase());

    let mime = match extension.as_str() {
        // Plain text and text-like formats.
        "txt" | "gitignore" | "env" | "lock" | "cfg" | "ini" | "log" => "text/plain",
        "md" | "markdown" => "text/markdown",
        "yaml" | "yml" => "text/yaml",
        "toml" => "text/toml",
        "html" | "htm" => "text/html",
        "css" => "text/css",
        "csv" => "text/csv",
        "tsv" => "text/tab-separated-values",
        "sh" | "bash" => "text/x-shellscript",
        "dockerfile" => "text/x-dockerfile",
        "makefile" => "text/x-makefile",
        // Structured data and documents.
        "json" => "application/json",
        "xml" => "application/xml",
        "pdf" => "application/pdf",
        // Images.
        "svg" => "image/svg+xml",
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        // Archives and binaries.
        "zip" => "application/zip",
        "tar" => "application/x-tar",
        "gz" => "application/gzip",
        "wasm" => "application/wasm",
        _ => "application/octet-stream",
    };

    String::from(mime)
}
266
#[cfg(test)]
// Unwrapping is fine in tests: a failed unwrap is itself a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The protocol descriptor exposes the expected name, kinds and rules.
    #[test]
    fn protocol_def() {
        let descriptor = protocol();
        assert_eq!(descriptor.name, "raw_file");
        assert_eq!(descriptor.obj_kinds.len(), 3);
        assert_eq!(descriptor.edge_rules.len(), 2);
        assert!(descriptor.has_order);
    }

    /// Both theory names end up in the registry.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        for theory in ["ThRawFileSchema", "ThRawFileInstance"] {
            assert!(registry.contains_key(theory));
        }
    }

    /// Three lines of text produce a root vertex plus three line vertices,
    /// and the root carries the MIME type derived from the `.md` extension.
    #[test]
    fn parse_text_file() {
        let schema = parse_text("Hello World\nSecond line\nThird line", "README.md").unwrap();

        // 1 file + 3 lines = 4 vertices.
        assert_eq!(schema.vertices.len(), 4);

        let root: panproto_gat::Name = "README.md".into();
        let mime = schema
            .constraints
            .get(&root)
            .unwrap()
            .iter()
            .find(|c| c.sort.as_ref() == "mime-type")
            .map(|c| c.value.as_str());
        assert_eq!(mime, Some("text/markdown"));
    }

    /// Newline-terminated text survives a parse/emit round trip unchanged.
    #[test]
    fn parse_and_emit_roundtrip() {
        let original = "line one\nline two\nline three\n";
        let schema = parse_text(original, "test.txt").unwrap();
        assert_eq!(emit_text(&schema).unwrap(), original);
    }

    /// Empty input yields only the root file vertex.
    #[test]
    fn parse_empty_file() {
        let schema = parse_text("", "empty.txt").unwrap();
        assert_eq!(schema.vertices.len(), 1);
    }

    /// Binary input yields a file vertex and one chunk vertex, with the
    /// expected MIME type and encoding recorded on the file vertex.
    #[test]
    fn parse_binary_file() {
        let png_magic = [0x89, 0x50, 0x4E, 0x47];
        let schema = parse_binary("image.png", &png_magic).unwrap();

        // file + chunk
        assert_eq!(schema.vertices.len(), 2);

        let root: panproto_gat::Name = "image.png".into();
        let on_file = schema.constraints.get(&root).unwrap();

        let mime = on_file
            .iter()
            .find(|c| c.sort.as_ref() == "mime-type")
            .map(|c| c.value.as_str());
        assert_eq!(mime, Some("image/png"));

        let encoding = on_file
            .iter()
            .find(|c| c.sort.as_ref() == "encoding")
            .map(|c| c.value.as_str());
        assert_eq!(encoding, Some("binary"));
    }

    /// Extension-based MIME detection, including the no-extension case.
    #[test]
    fn mime_detection() {
        assert_eq!(mime_from_path("README.md"), "text/markdown");
        assert_eq!(mime_from_path("data.json"), "application/json");
        assert_eq!(mime_from_path("photo.jpg"), "image/jpeg");
        assert_eq!(mime_from_path("unknown.xyz"), "application/octet-stream");
        // "Dockerfile" contains no dot, so `mime_from_path` treats it as
        // having no extension at all and falls through to the default.
        assert_eq!(mime_from_path("Dockerfile"), "application/octet-stream");
        assert_eq!(mime_from_path("app.dockerfile"), "text/x-dockerfile");
    }
}