use std::collections::HashMap;
use std::hash::BuildHasher;
use panproto_gat::Theory;
use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
use crate::error::ProtocolError;
use crate::theories;
#[must_use]
pub fn protocol() -> Protocol {
Protocol {
name: "raw_file".into(),
schema_theory: "ThRawFileSchema".into(),
instance_theory: "ThRawFileInstance".into(),
schema_composition: None,
instance_composition: None,
edge_rules: vec![
EdgeRule {
edge_kind: "line-of".into(),
src_kinds: vec!["file".into()],
tgt_kinds: vec!["line".into()],
},
EdgeRule {
edge_kind: "chunk-of".into(),
src_kinds: vec!["file".into()],
tgt_kinds: vec!["chunk".into()],
},
],
obj_kinds: vec!["file".into(), "line".into(), "chunk".into()],
constraint_sorts: vec![
"mime-type".into(),
"encoding".into(),
"line-number".into(),
"content".into(),
"content-length".into(),
"content-hash".into(),
],
has_order: true,
has_coproducts: false,
has_recursion: false,
has_causal: false,
nominal_identity: false,
has_defaults: false,
has_coercions: false,
has_mergers: false,
has_policies: false,
}
}
/// Registers the `raw_file` schema and instance theories in `registry`.
///
/// Delegates to `theories::register_constrained_multigraph_wtype`, passing
/// the theory names `ThRawFileSchema` and `ThRawFileInstance`.
pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
    const SCHEMA_THEORY: &str = "ThRawFileSchema";
    const INSTANCE_THEORY: &str = "ThRawFileInstance";

    theories::register_constrained_multigraph_wtype(registry, SCHEMA_THEORY, INSTANCE_THEORY);
}
/// Parses UTF-8 text into a `raw_file` [`Schema`].
///
/// The resulting schema contains:
///
/// * one `file` vertex whose id is `file_path`, carrying a `mime-type`
///   constraint derived from the path and an `encoding` constraint of
///   `utf-8`;
/// * one `line` vertex per item yielded by [`str::lines`], with id
///   `{file_path}::line_{i}`, linked from the file vertex by a `line-of`
///   edge and carrying `content` (the line text) and `line-number` (the
///   zero-based index `i`) constraints.
///
/// # Errors
///
/// Returns [`ProtocolError::Parse`] when the builder rejects a vertex or an
/// edge, or when the final `build` step fails.
pub fn parse_text(input: &str, file_path: &str) -> Result<Schema, ProtocolError> {
    let proto = protocol();

    // Root vertex for the file itself, tagged with its MIME type and encoding.
    let mut schema = SchemaBuilder::new(&proto)
        .vertex(file_path, "file", None)
        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?
        .constraint(file_path, "mime-type", &mime_from_path(file_path))
        .constraint(file_path, "encoding", "utf-8");

    // One vertex per line, attached to the file and tagged with its text and
    // zero-based position.
    for (index, text) in input.lines().enumerate() {
        let vertex_id = format!("{file_path}::line_{index}");
        schema = schema
            .vertex(&vertex_id, "line", None)
            .map_err(|e| ProtocolError::Parse(format!("line {index}: {e}")))?
            .edge(file_path, &vertex_id, "line-of", None)
            .map_err(|e| ProtocolError::Parse(format!("line-of edge {index}: {e}")))?
            .constraint(&vertex_id, "content", text)
            .constraint(&vertex_id, "line-number", &index.to_string());
    }

    schema
        .build()
        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
}
/// Parses opaque bytes into a `raw_file` [`Schema`].
///
/// The resulting schema contains:
///
/// * one `file` vertex whose id is `file_path`, carrying `mime-type`
///   (derived from the path), `encoding` (`binary`) and `content-length`
///   (the decimal byte count of `content`) constraints;
/// * one `chunk` vertex with id `{file_path}::chunk_0`, linked from the file
///   vertex by a `chunk-of` edge and carrying a `content-hash` constraint
///   holding the hex-encoded `blake3::hash` digest of `content`.
///
/// # Errors
///
/// Returns [`ProtocolError::Parse`] when the builder rejects a vertex or an
/// edge, or when the final `build` step fails.
pub fn parse_binary(file_path: &str, content: &[u8]) -> Result<Schema, ProtocolError> {
    let proto = protocol();
    // The whole payload is represented by a single chunk.
    let chunk_vertex = format!("{file_path}::chunk_0");

    SchemaBuilder::new(&proto)
        .vertex(file_path, "file", None)
        .map_err(|e| ProtocolError::Parse(format!("file vertex: {e}")))?
        .constraint(file_path, "mime-type", &mime_from_path(file_path))
        .constraint(file_path, "encoding", "binary")
        .constraint(file_path, "content-length", &content.len().to_string())
        .vertex(&chunk_vertex, "chunk", None)
        .map_err(|e| ProtocolError::Parse(format!("chunk vertex: {e}")))?
        .edge(file_path, &chunk_vertex, "chunk-of", None)
        .map_err(|e| ProtocolError::Parse(format!("chunk-of edge: {e}")))?
        .constraint(
            &chunk_vertex,
            "content-hash",
            blake3::hash(content).to_hex().as_str(),
        )
        .build()
        .map_err(|e| ProtocolError::Parse(format!("build: {e}")))
}
/// Reassembles the text stored in a `raw_file` [`Schema`].
///
/// Every vertex of kind `line` contributes one output line:
///
/// * its position is the `line-number` constraint parsed as `usize`; when
///   that constraint is missing or unparsable, the number of lines collected
///   so far is used instead;
/// * its text is the `content` constraint, or the empty string when absent.
///
/// Lines are ordered by position (stable sort) and **each** line is
/// terminated with `\n`. A schema holding a single empty line therefore
/// yields `"\n"` rather than `""`, so text consisting of one blank line
/// survives a `parse_text` / `emit_text` round trip. A schema without any
/// `line` vertex yields the empty string.
///
/// # Errors
///
/// This implementation never returns `Err`; the `Result` is part of the
/// existing signature.
pub fn emit_text(schema: &Schema) -> Result<String, ProtocolError> {
    let mut lines: Vec<(usize, String)> = Vec::new();

    for (name, vertex) in &schema.vertices {
        if vertex.kind.as_ref() != "line" {
            continue;
        }

        // Look the vertex's constraints up once and reuse them for both sorts.
        let constraints = schema.constraints.get(name);

        let line_num = constraints
            .and_then(|cs| {
                cs.iter()
                    .find(|c| c.sort.as_ref() == "line-number")
                    .and_then(|c| c.value.parse::<usize>().ok())
            })
            // No usable line number: fall back to the running line count.
            .unwrap_or(lines.len());

        let content = constraints
            .and_then(|cs| {
                cs.iter()
                    .find(|c| c.sort.as_ref() == "content")
                    .map(|c| c.value.clone())
            })
            .unwrap_or_default();

        lines.push((line_num, content));
    }

    lines.sort_by_key(|(num, _)| *num);

    // Terminate every line individually. Deciding on the trailing newline by
    // checking whether the joined text is empty would drop the terminator of
    // a lone empty line.
    let capacity: usize = lines.iter().map(|(_, content)| content.len() + 1).sum();
    let mut result = String::with_capacity(capacity);
    for (_, content) in &lines {
        result.push_str(content);
        result.push('\n');
    }

    Ok(result)
}
/// Guesses a MIME type from the text after the last `.` in `path`.
///
/// The suffix is lower-cased before lookup, so matching is case-insensitive.
/// Paths without a `.` (for example a bare `Dockerfile`) and unrecognised
/// suffixes map to `application/octet-stream`.
fn mime_from_path(path: &str) -> String {
    /// MIME type reported when the suffix is missing or unknown.
    const FALLBACK: &str = "application/octet-stream";

    /// Recognised lower-case suffixes and the MIME type each one maps to.
    const KNOWN_SUFFIXES: &[(&str, &str)] = &[
        ("md", "text/markdown"),
        ("markdown", "text/markdown"),
        ("txt", "text/plain"),
        ("json", "application/json"),
        ("yaml", "text/yaml"),
        ("yml", "text/yaml"),
        ("toml", "text/toml"),
        ("xml", "application/xml"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("css", "text/css"),
        ("svg", "image/svg+xml"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("pdf", "application/pdf"),
        ("zip", "application/zip"),
        ("tar", "application/x-tar"),
        ("gz", "application/gzip"),
        ("wasm", "application/wasm"),
        ("sh", "text/x-shellscript"),
        ("bash", "text/x-shellscript"),
        ("dockerfile", "text/x-dockerfile"),
        ("makefile", "text/x-makefile"),
        ("gitignore", "text/plain"),
        ("env", "text/plain"),
        ("lock", "text/plain"),
        ("cfg", "text/plain"),
        ("ini", "text/plain"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("log", "text/plain"),
    ];

    // Everything after the final '.', or "" when the path has no '.' at all.
    let suffix = path
        .rsplit_once('.')
        .map_or("", |(_, tail)| tail)
        .to_lowercase();

    KNOWN_SUFFIXES
        .iter()
        .find(|(known, _)| *known == suffix.as_str())
        .map_or(FALLBACK, |(_, mime)| *mime)
        .to_owned()
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The descriptor reports the expected name, kind counts and ordering flag.
    #[test]
    fn protocol_def() {
        let descriptor = protocol();
        assert_eq!(descriptor.name, "raw_file");
        assert_eq!(descriptor.obj_kinds.len(), 3);
        assert_eq!(descriptor.edge_rules.len(), 2);
        assert!(descriptor.has_order);
    }

    /// Both theory names are present in the registry after registration.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        for theory_name in ["ThRawFileSchema", "ThRawFileInstance"] {
            assert!(registry.contains_key(theory_name));
        }
    }

    /// Three lines plus the file vertex give four vertices; `.md` maps to
    /// the markdown MIME type.
    #[test]
    fn parse_text_file() {
        let schema = parse_text("Hello World\nSecond line\nThird line", "README.md").unwrap();
        assert_eq!(schema.vertices.len(), 4);

        let file_key: panproto_gat::Name = "README.md".into();
        let mime = schema
            .constraints
            .get(&file_key)
            .unwrap()
            .iter()
            .find(|c| c.sort.as_ref() == "mime-type")
            .unwrap();
        assert_eq!(mime.value, "text/markdown");
    }

    /// Newline-terminated text survives a parse/emit round trip unchanged.
    #[test]
    fn parse_and_emit_roundtrip() {
        let original = "line one\nline two\nline three\n";
        let emitted = parse_text(original, "test.txt")
            .and_then(|schema| emit_text(&schema))
            .unwrap();
        assert_eq!(emitted, original);
    }

    /// Empty input produces only the file vertex.
    #[test]
    fn parse_empty_file() {
        let vertex_count = parse_text("", "empty.txt").unwrap().vertices.len();
        assert_eq!(vertex_count, 1);
    }

    /// Binary input produces a file vertex and one chunk vertex, with the
    /// file tagged by MIME type and `binary` encoding.
    #[test]
    fn parse_binary_file() {
        let schema = parse_binary("image.png", &[0x89, 0x50, 0x4E, 0x47]).unwrap();
        assert_eq!(schema.vertices.len(), 2);

        let file_key: panproto_gat::Name = "image.png".into();
        let file_constraints = schema.constraints.get(&file_key).unwrap();
        for (sort, expected) in [("mime-type", "image/png"), ("encoding", "binary")] {
            let found = file_constraints
                .iter()
                .find(|c| c.sort.as_ref() == sort)
                .unwrap();
            assert_eq!(found.value, expected);
        }
    }

    /// MIME detection keys on the text after the last dot, so a bare
    /// `Dockerfile` (no dot) falls back to the generic type.
    #[test]
    fn mime_detection() {
        let cases = [
            ("README.md", "text/markdown"),
            ("data.json", "application/json"),
            ("photo.jpg", "image/jpeg"),
            ("unknown.xyz", "application/octet-stream"),
            ("Dockerfile", "application/octet-stream"),
            ("app.dockerfile", "text/x-dockerfile"),
        ];
        for (path, expected) in cases {
            assert_eq!(mime_from_path(path), expected, "path: {}", path);
        }
    }
}