use std::collections::HashMap;
use std::hash::BuildHasher;
use std::path::{Path, PathBuf};
use panproto_project::ProjectBuilder;
use panproto_vcs::{CommitObject, FileSchemaObject, Object, ObjectId, Store};
use rustc_hash::FxHashMap;
use crate::error::GitBridgeError;
/// File name (inside the cache directory) under which the persistent
/// blob-oid → schema-object-id cache is stored; see `load_blob_cache` /
/// `save_blob_cache`.
pub const BLOB_CACHE_FILE: &str = "blob_to_schema";
/// Errors produced while loading the on-disk blob→schema cache.
#[derive(Debug, thiserror::Error)]
pub enum BlobCacheLoadError {
    /// The cache file exists but a line does not match the expected
    /// `<git-blob-oid> <protocol> <panproto-id>` format.
    #[error("blob cache at {path} is corrupt at line {line}: {reason}")]
    Corrupt {
        // Display form of the cache file path.
        path: String,
        // 1-based line number of the offending entry.
        line: usize,
        // Human-readable description of what was malformed.
        reason: String,
    },
    /// The cache file could not be read for a reason other than absence
    /// (a missing file is treated as an empty cache, not an error).
    #[error("blob cache at {path}: {source}")]
    Io {
        path: String,
        #[source]
        source: std::io::Error,
    },
}
/// Loads the blob→schema cache from `path`.
///
/// Each non-blank line must contain exactly three whitespace-separated
/// fields: `<git-blob-oid> <protocol> <panproto-object-id>`. A missing file
/// is not an error — an empty cache is returned so the first import can
/// populate it.
///
/// # Errors
///
/// Returns [`BlobCacheLoadError::Io`] for any read failure other than
/// `NotFound`, and [`BlobCacheLoadError::Corrupt`] (with a 1-based line
/// number) for any malformed line — including lines with trailing fields
/// after the panproto id, which were previously accepted silently.
pub fn load_blob_cache(path: &Path) -> Result<BlobSchemaCache, BlobCacheLoadError> {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        // A missing cache file simply means "nothing cached yet".
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            return Ok(BlobSchemaCache::default());
        }
        Err(source) => {
            return Err(BlobCacheLoadError::Io {
                path: path.display().to_string(),
                source,
            });
        }
    };
    // Single construction point for corruption errors (path repeated on every line).
    let corrupt = |line: usize, reason: String| BlobCacheLoadError::Corrupt {
        path: path.display().to_string(),
        line,
        reason,
    };
    let mut map = BlobSchemaCache::default();
    for (idx, line) in content.lines().enumerate() {
        let line_no = idx + 1;
        let mut parts = line.split_whitespace();
        // `split_whitespace` yields nothing for blank/whitespace-only lines,
        // so this single check replaces the separate `trim().is_empty()` test.
        let Some(blob_hex) = parts.next() else {
            continue;
        };
        let Some(protocol) = parts.next() else {
            return Err(corrupt(
                line_no,
                "missing protocol slot; delete the cache file and reimport".to_owned(),
            ));
        };
        let Some(panproto_hex) = parts.next() else {
            return Err(corrupt(line_no, "missing panproto id".to_owned()));
        };
        // Fix: previously any extra fields after the id were silently ignored,
        // letting a corrupted or concatenated cache line load as valid.
        if parts.next().is_some() {
            return Err(corrupt(
                line_no,
                "unexpected trailing data after panproto id".to_owned(),
            ));
        }
        let blob_oid = git2::Oid::from_str(blob_hex)
            .map_err(|e| corrupt(line_no, format!("bad git oid: {e}")))?;
        let panproto_id = panproto_hex
            .parse::<ObjectId>()
            .map_err(|e| corrupt(line_no, format!("bad panproto id: {e}")))?;
        map.insert((blob_oid, protocol.to_owned()), panproto_id);
    }
    Ok(map)
}
/// Atomically persists `cache` to `path` as sorted `<blob> <protocol> <id>` lines.
///
/// The content is written to a sibling temporary file, fsynced, and renamed
/// into place so a crash never leaves a half-written cache; the parent
/// directory is fsynced afterwards so the rename itself is durable.
///
/// # Errors
///
/// Fails if `path` has no parent directory, if any entry carries an empty
/// protocol (such a line could not round-trip through `load_blob_cache`),
/// or on any underlying I/O error. On failure the temporary file is removed
/// (best effort) instead of being left behind.
pub fn save_blob_cache(path: &Path, cache: &BlobSchemaCache) -> std::io::Result<()> {
    use std::io::Write;
    let parent = path.parent().ok_or_else(|| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "blob cache path has no parent directory",
        )
    })?;
    std::fs::create_dir_all(parent)?;
    let mut lines: Vec<String> = Vec::with_capacity(cache.len());
    for ((blob, protocol), id) in cache {
        // An empty protocol would serialize as only two fields and be
        // rejected (or mis-parsed) on the next load.
        if protocol.is_empty() {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!(
                    "blob cache entry for {blob} has empty protocol; every entry must carry a protocol name"
                ),
            ));
        }
        lines.push(format!("{blob} {protocol} {id}"));
    }
    // Sorted output keeps the file deterministic across hash-map iteration orders.
    lines.sort();
    let body = lines.join("\n") + "\n";
    let tmp = path.with_extension("tmp");
    let write_and_rename = || -> std::io::Result<()> {
        let mut f = std::fs::File::create(&tmp)?;
        f.write_all(body.as_bytes())?;
        f.sync_all()?;
        std::fs::rename(&tmp, path)
    };
    if let Err(e) = write_and_rename() {
        // Fix: don't leave a stale `.tmp` file behind when writing fails.
        let _ = std::fs::remove_file(&tmp);
        return Err(e);
    }
    // Durably record the rename in the directory entry.
    let dir = std::fs::File::open(parent)?;
    dir.sync_all()?;
    Ok(())
}
/// Maps a (git blob oid, protocol name) pair to the panproto object id of the
/// `FileSchemaObject` stored for that blob under that protocol.
pub type BlobSchemaCache = FxHashMap<(git2::Oid, String), ObjectId>;
/// Summary of one git → panproto import run.
#[derive(Debug)]
pub struct ImportResult {
    // Number of commits newly imported by this run (0 if everything was known).
    pub commit_count: usize,
    // Panproto id of the commit the revspec resolved to.
    pub head_id: ObjectId,
    // (git oid, panproto id) pair for each commit imported this run, in import order.
    pub oid_map: Vec<(git2::Oid, ObjectId)>,
}
/// Imports every commit reachable from `revspec` into `panproto_store`,
/// starting from scratch.
///
/// Thin convenience wrapper over [`import_git_repo_incremental`] with an
/// empty `known` map.
pub fn import_git_repo<S: Store>(
    git_repo: &git2::Repository,
    panproto_store: &mut S,
    revspec: &str,
) -> Result<ImportResult, GitBridgeError> {
    let no_known_commits = FxHashMap::default();
    import_git_repo_incremental(git_repo, panproto_store, revspec, &no_known_commits)
}
/// Imports commits not yet in `known`, persisting the blob→schema cache in
/// `cache_dir` across runs (loaded before the import, saved after it).
///
/// Cache load/save failures are surfaced as `GitBridgeError::BlobCache`.
pub fn import_git_repo_persistent<S: Store, H: BuildHasher>(
    git_repo: &git2::Repository,
    panproto_store: &mut S,
    revspec: &str,
    known: &HashMap<git2::Oid, ObjectId, H>,
    cache_dir: &Path,
) -> Result<ImportResult, GitBridgeError> {
    let cache_file = cache_dir.join(BLOB_CACHE_FILE);
    let mut blob_cache = match load_blob_cache(&cache_file) {
        Ok(cache) => cache,
        Err(e) => return Err(GitBridgeError::BlobCache(e.to_string())),
    };
    let outcome =
        import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut blob_cache)?;
    if let Err(e) = save_blob_cache(&cache_file, &blob_cache) {
        return Err(GitBridgeError::BlobCache(e.to_string()));
    }
    Ok(outcome)
}
/// Imports commits reachable from `revspec` that are absent from `known`,
/// using a throwaway in-memory blob cache (nothing is persisted between runs).
pub fn import_git_repo_incremental<S: Store, H: BuildHasher>(
    git_repo: &git2::Repository,
    panproto_store: &mut S,
    revspec: &str,
    known: &HashMap<git2::Oid, ObjectId, H>,
) -> Result<ImportResult, GitBridgeError> {
    let mut throwaway_cache = BlobSchemaCache::default();
    import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut throwaway_cache)
}
/// Imports all commits reachable from `revspec` that are not already in
/// `known`, reusing `blob_cache` to skip re-parsing blobs seen before.
///
/// Commits are walked parents-first (topological + reversed), so each
/// commit's parents are normally mapped before the commit itself is built.
/// Parents that cannot be mapped (neither in `known` nor imported this run)
/// are silently dropped from the resulting panproto commit.
///
/// # Errors
///
/// Fails if `revspec` does not resolve to a commit, or on any underlying
/// git, store, or tree-construction error.
pub fn import_git_repo_with_cache<S, H>(
    git_repo: &git2::Repository,
    panproto_store: &mut S,
    revspec: &str,
    known: &HashMap<git2::Oid, ObjectId, H>,
    blob_cache: &mut BlobSchemaCache,
) -> Result<ImportResult, GitBridgeError>
where
    S: Store,
    H: BuildHasher,
{
    let obj = git_repo.revparse_single(revspec)?;
    let head_commit = obj
        .peel_to_commit()
        .map_err(|e| GitBridgeError::ObjectRead {
            oid: obj.id().to_string(),
            reason: format!("not a commit: {e}"),
        })?;
    let head_git_oid = head_commit.id();
    let mut commits = Vec::new();
    collect_new_ancestors(git_repo, head_git_oid, known, &mut commits)?;
    // Seed the git→panproto translation table with everything the caller
    // already imported, so parent lookups resolve across runs.
    let mut git_to_panproto: FxHashMap<git2::Oid, ObjectId> =
        known.iter().map(|(&k, &v)| (k, v)).collect();
    let mut oid_map: Vec<(git2::Oid, ObjectId)> = Vec::new();
    let mut last_id = ObjectId::ZERO;
    for git_oid in &commits {
        let git_commit = git_repo.find_commit(*git_oid)?;
        let tree = git_commit.tree()?;
        let mut leaves: Vec<(PathBuf, ObjectId)> = Vec::new();
        collect_tree_leaves(
            git_repo,
            &tree,
            Path::new(""),
            panproto_store,
            blob_cache,
            &mut leaves,
        )?;
        let root_id = if leaves.is_empty() {
            // A commit with an empty git tree still needs a root object.
            build_empty_tree_root(panproto_store)?
        } else {
            panproto_vcs::build_tree_from_leaves(panproto_store, leaves)
                .map_err(GitBridgeError::Vcs)?
        };
        // Unmapped parents are dropped rather than treated as an error.
        let parents: Vec<ObjectId> = git_commit
            .parent_ids()
            .filter_map(|parent_oid| git_to_panproto.get(&parent_oid).copied())
            .collect();
        let author_sig = git_commit.author();
        let author = author_sig.name().unwrap_or("unknown").to_owned();
        // Pre-epoch (negative) author timestamps are clamped to 0.
        let timestamp = u64::try_from(author_sig.when().seconds()).unwrap_or(0);
        let message = git_commit.message().unwrap_or("(no message)").to_owned();
        let commit = CommitObject::builder(root_id, "project", &author, &message)
            .parents(parents)
            .timestamp(timestamp)
            .build();
        let commit_id = panproto_store.put(&Object::Commit(commit))?;
        git_to_panproto.insert(*git_oid, commit_id);
        oid_map.push((*git_oid, commit_id));
        last_id = commit_id;
    }
    // Nothing new to import: report the already-known id for the head, if any.
    if commits.is_empty() {
        if let Some(&id) = known.get(&head_git_oid) {
            last_id = id;
        }
    }
    Ok(ImportResult {
        commit_count: commits.len(),
        head_id: last_id,
        oid_map,
    })
}

/// Builds the panproto tree root used for commits whose git tree is empty:
/// a single placeholder `__empty__` raw file, so downstream code always has
/// a valid root object to reference.
fn build_empty_tree_root<S: Store>(store: &mut S) -> Result<ObjectId, GitBridgeError> {
    // All schema-construction failures map onto the same project error shape.
    let coproduct_err = |reason: String| {
        GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed { reason })
    };
    let proto = panproto_protocols::raw_file::protocol();
    let schema = panproto_schema::SchemaBuilder::new(&proto)
        .vertex("root", "file", None)
        .map_err(|e| coproduct_err(format!("empty tree schema: {e}")))?
        .build()
        .map_err(|e| coproduct_err(format!("empty tree build: {e}")))?;
    let file = FileSchemaObject {
        path: "__empty__".to_owned(),
        protocol: "raw_file".to_owned(),
        schema,
        cross_file_edges: Vec::new(),
    };
    let leaf_id = store.put(&Object::FileSchema(Box::new(file)))?;
    panproto_vcs::build_tree_from_leaves(store, vec![(PathBuf::from("__empty__"), leaf_id)])
        .map_err(GitBridgeError::Vcs)
}
/// Recursively walks a git tree, producing a `(path, schema object id)` leaf
/// for every blob, parsing each blob at most once thanks to `blob_cache`.
///
/// Cache keys are `(blob oid, protocol)`. On a miss the blob is parsed; the
/// result is stored under the actual protocol the parse chose, and also
/// mirrored under the initial language guess when the two differ, so the
/// same guess hits the cache next time.
///
/// # Errors
///
/// Fails on a non-UTF-8 tree entry name, or on any underlying git, parse,
/// or store error.
fn collect_tree_leaves<S: Store>(
    repo: &git2::Repository,
    tree: &git2::Tree<'_>,
    prefix: &Path,
    store: &mut S,
    blob_cache: &mut BlobSchemaCache,
    leaves: &mut Vec<(PathBuf, ObjectId)>,
) -> Result<(), GitBridgeError> {
    for entry in tree {
        let Some(entry_name) = entry.name() else {
            return Err(GitBridgeError::NonUtf8TreeEntry {
                parent: prefix.display().to_string(),
            });
        };
        let entry_path = prefix.join(entry_name);
        // Recurse into subtrees; skip anything that is neither tree nor blob.
        if entry.kind() == Some(git2::ObjectType::Tree) {
            let subtree = repo.find_tree(entry.id())?;
            collect_tree_leaves(repo, &subtree, &entry_path, store, blob_cache, leaves)?;
            continue;
        }
        if entry.kind() != Some(git2::ObjectType::Blob) {
            continue;
        }
        let blob_oid = entry.id();
        let guess = panproto_project::detect::detect_language(
            &entry_path,
            &panproto_parse::ParserRegistry::new(),
        )
        .map_or_else(String::new, ToOwned::to_owned);
        let cache_key = (blob_oid, guess.clone());
        let leaf_id = match blob_cache.get(&cache_key) {
            Some(&cached) => cached,
            None => {
                let blob = repo.find_blob(blob_oid)?;
                let (schema, protocol) = parse_single_blob(&entry_path, blob.content())?;
                let file = FileSchemaObject {
                    path: entry_path.display().to_string(),
                    protocol: protocol.clone(),
                    schema,
                    cross_file_edges: Vec::new(),
                };
                let id = store.put(&Object::FileSchema(Box::new(file)))?;
                blob_cache.insert((blob_oid, protocol.clone()), id);
                // Mirror under the guess so future lookups with the same
                // guess hit the cache even though the parse chose otherwise.
                if protocol != cache_key.1 {
                    blob_cache.insert(cache_key, id);
                }
                id
            }
        };
        leaves.push((entry_path, leaf_id));
    }
    Ok(())
}
/// Parses one blob's bytes in an isolated `ProjectBuilder` and returns its
/// schema together with the protocol name chosen for it (falling back to
/// `"raw_file"` when no protocol was recorded).
///
/// Only one file is added, so the builder's maps hold at most one entry.
///
/// # Errors
///
/// Fails if `add_file` rejects the content, or if the parse yields no schema.
fn parse_single_blob(
    path: &Path,
    content: &[u8],
) -> Result<(panproto_schema::Schema, String), GitBridgeError> {
    let mut builder = ProjectBuilder::new();
    builder.add_file(path, content)?;
    // Fix: previously the whole schema and protocol maps were cloned just to
    // extract their first element; borrow them and clone only the one value.
    let schema = builder
        .file_schemas()
        .iter()
        .next()
        .map(|(_, s)| s.clone())
        .ok_or_else(|| {
            GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
                reason: "single-blob parse produced no schema".to_owned(),
            })
        })?;
    let protocol = builder
        .protocol_map_ref()
        .iter()
        .next()
        .map_or_else(|| "raw_file".to_owned(), |(_, p)| p.clone());
    Ok((schema, protocol))
}
/// Appends to `result` every ancestor of `head` (inclusive) that is not
/// hidden by a commit in `known`, oldest first (topological + reversed).
///
/// Hiding a known oid is best-effort: a known commit that does not exist in
/// this repository is simply ignored.
fn collect_new_ancestors<H: BuildHasher>(
    repo: &git2::Repository,
    head: git2::Oid,
    known: &HashMap<git2::Oid, ObjectId, H>,
    result: &mut Vec<git2::Oid>,
) -> Result<(), GitBridgeError> {
    let mut walk = repo.revwalk()?;
    walk.push(head)?;
    walk.set_sorting(git2::Sort::TOPOLOGICAL | git2::Sort::REVERSE)?;
    for already_imported in known.keys() {
        let _ = walk.hide(*already_imported);
    }
    let new_oids = walk.collect::<Result<Vec<_>, _>>()?;
    result.extend(new_oids);
    Ok(())
}