// panproto_git/import.rs
1//! Import git repositories into panproto-vcs.
2//!
3//! Walks the git commit DAG topologically, parses each commit's file tree
4//! into a panproto project schema, and creates panproto-vcs commits that
5//! preserve authorship, timestamps, and parent structure.
6
7use std::collections::HashMap;
8use std::hash::BuildHasher;
9use std::path::{Path, PathBuf};
10
11use panproto_project::ProjectBuilder;
12use panproto_vcs::{CommitObject, FileSchemaObject, Object, ObjectId, Store};
13use rustc_hash::FxHashMap;
14
15use crate::error::GitBridgeError;
16
/// Standard on-disk name of the blob-OID to `FileSchema`
/// [`ObjectId`] cache.
///
/// Joined onto the caller-supplied cache directory by
/// [`import_git_repo_persistent`].
pub const BLOB_CACHE_FILE: &str = "blob_to_schema";
20
/// Error loading a blob-to-schema cache.
///
/// Returned by [`load_blob_cache`]. A [`Corrupt`](Self::Corrupt)
/// cache is recoverable by deleting the cache file and re-importing.
#[derive(Debug, thiserror::Error)]
pub enum BlobCacheLoadError {
    /// The cache file exists but could not be parsed.
    #[error("blob cache at {path} is corrupt at line {line}: {reason}")]
    Corrupt {
        /// Display of the cache file path.
        path: String,
        /// 1-based line number of the first malformed entry.
        line: usize,
        /// What went wrong on that line.
        reason: String,
    },

    /// An I/O error occurred while reading the cache.
    ///
    /// Note: a missing file is *not* reported here; [`load_blob_cache`]
    /// maps `NotFound` to an empty cache.
    #[error("blob cache at {path}: {source}")]
    Io {
        /// Display of the cache file path.
        path: String,
        /// The underlying I/O error.
        #[source]
        source: std::io::Error,
    },
}
45
46/// Load a blob-to-schema cache from a plain-text file.
47///
48/// File format: one entry per line,
49/// `<git_blob_oid> <protocol_name> <file_schema_panproto_id>`. Every
50/// entry carries a non-empty protocol slot; a line with a missing or
51/// empty protocol is rejected as corrupt so the caller can delete the
52/// file and reimport rather than round-trip through an empty slot.
53///
54/// # Errors
55///
56/// Returns [`BlobCacheLoadError::Io`] for I/O problems other than a
57/// missing file (missing yields an empty cache), and
58/// [`BlobCacheLoadError::Corrupt`] if any line cannot be parsed.
59pub fn load_blob_cache(path: &Path) -> Result<BlobSchemaCache, BlobCacheLoadError> {
60 let content = match std::fs::read_to_string(path) {
61 Ok(c) => c,
62 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
63 return Ok(BlobSchemaCache::default());
64 }
65 Err(source) => {
66 return Err(BlobCacheLoadError::Io {
67 path: path.display().to_string(),
68 source,
69 });
70 }
71 };
72 let mut map = BlobSchemaCache::default();
73 for (idx, line) in content.lines().enumerate() {
74 if line.trim().is_empty() {
75 continue;
76 }
77 let mut parts = line.split_whitespace();
78 let Some(blob_hex) = parts.next() else {
79 continue;
80 };
81 let Some(protocol) = parts.next() else {
82 return Err(BlobCacheLoadError::Corrupt {
83 path: path.display().to_string(),
84 line: idx + 1,
85 reason: "missing protocol slot; delete the cache file and reimport".to_owned(),
86 });
87 };
88 let Some(panproto_hex) = parts.next() else {
89 return Err(BlobCacheLoadError::Corrupt {
90 path: path.display().to_string(),
91 line: idx + 1,
92 reason: "missing panproto id".to_owned(),
93 });
94 };
95 let blob_oid = git2::Oid::from_str(blob_hex).map_err(|e| BlobCacheLoadError::Corrupt {
96 path: path.display().to_string(),
97 line: idx + 1,
98 reason: format!("bad git oid: {e}"),
99 })?;
100 let panproto_id =
101 panproto_hex
102 .parse::<ObjectId>()
103 .map_err(|e| BlobCacheLoadError::Corrupt {
104 path: path.display().to_string(),
105 line: idx + 1,
106 reason: format!("bad panproto id: {e}"),
107 })?;
108 map.insert((blob_oid, protocol.to_owned()), panproto_id);
109 }
110 Ok(map)
111}
112
113/// Persist a blob-to-schema cache atomically.
114///
115/// Writes to `<path>.tmp` and renames into place, so a crash mid-write
116/// cannot leave a partial file that would later parse as corrupt.
117///
118/// # Errors
119///
120/// Returns any I/O error encountered while creating parent
121/// directories, writing, or renaming.
122pub fn save_blob_cache(path: &Path, cache: &BlobSchemaCache) -> std::io::Result<()> {
123 use std::io::Write;
124 let parent = path.parent().ok_or_else(|| {
125 std::io::Error::new(
126 std::io::ErrorKind::InvalidInput,
127 "blob cache path has no parent directory",
128 )
129 })?;
130 std::fs::create_dir_all(parent)?;
131 let mut lines: Vec<String> = Vec::with_capacity(cache.len());
132 for ((blob, protocol), id) in cache {
133 if protocol.is_empty() {
134 return Err(std::io::Error::new(
135 std::io::ErrorKind::InvalidInput,
136 format!(
137 "blob cache entry for {blob} has empty protocol; every entry must carry a protocol name"
138 ),
139 ));
140 }
141 lines.push(format!("{blob} {protocol} {id}"));
142 }
143 lines.sort();
144 let body = lines.join("\n") + "\n";
145 let tmp = path.with_extension("tmp");
146 // Create + write + fsync the temp file so its bytes are on disk
147 // before the rename. Then fsync the parent directory so the
148 // rename itself is durable; without this, a crash can leave the
149 // rename unrecorded even though the payload is on disk.
150 {
151 let mut f = std::fs::File::create(&tmp)?;
152 f.write_all(body.as_bytes())?;
153 f.sync_all()?;
154 }
155 std::fs::rename(&tmp, path)?;
156 let dir = std::fs::File::open(parent)?;
157 dir.sync_all()?;
158 Ok(())
159}
160
/// Cache mapping a `(git blob OID, protocol)` pair to the
/// content-addressed [`ObjectId`] of the [`FileSchemaObject`]
/// produced by parsing it.
///
/// Keying on the protocol avoids a cross-protocol collision: the
/// same bytes appearing as `a.py` and `a.txt` parse to different
/// per-file schemas and therefore must hash to different
/// [`ObjectId`]s, which means they must occupy different cache
/// slots.
///
/// A [`BlobSchemaCache`] is the key to making incremental tree-based
/// imports cheap: when a new git commit only changes one file, every
/// other `(blob, protocol)` pair is already in the cache, so the
/// importer reuses the existing [`ObjectId`] and only has to rewrite
/// the tree-node objects on the path from the changed file to the
/// project root.
///
/// Backed by `FxHashMap` for fast hashing of the short fixed-size
/// keys. Persist with [`save_blob_cache`] / [`load_blob_cache`].
pub type BlobSchemaCache = FxHashMap<(git2::Oid, String), ObjectId>;
178
/// Result of importing a git repository.
#[derive(Debug)]
pub struct ImportResult {
    /// Number of commits imported by this call; commits skipped via a
    /// `known` map are not counted.
    pub commit_count: usize,
    /// The panproto-vcs object ID of the HEAD commit after import.
    ///
    /// When no new commits were imported, this is the `known` mapping
    /// for the head commit, or `ObjectId::ZERO` if the head has no
    /// `known` entry.
    pub head_id: ObjectId,
    /// Mapping from git commit OIDs to panproto-vcs object IDs, in
    /// import order (topological, parents before children).
    pub oid_map: Vec<(git2::Oid, ObjectId)>,
}
189
190/// Import a range of git commits into a panproto-vcs store.
191///
192/// Walks the git commit DAG starting from `revspec` (e.g. "HEAD", "main",
193/// "HEAD~10..HEAD") in topological order. For each commit:
194///
195/// 1. Reads all files from the git tree
196/// 2. Parses them into a project schema via `panproto-project`
197/// 3. Stores the schema as a panproto-vcs object
198/// 4. Creates a panproto-vcs commit preserving author, timestamp, message, parents
199///
200/// This is a convenience wrapper around [`import_git_repo_incremental`] with
201/// an empty `known` map, which re-imports the entire history reachable from
202/// `revspec`. For repeated imports against a persistent store, prefer
203/// [`import_git_repo_incremental`] to avoid walking already-imported ancestors.
204///
205/// # Errors
206///
207/// Returns [`GitBridgeError`] if git operations, parsing, or VCS operations fail.
208pub fn import_git_repo<S: Store>(
209 git_repo: &git2::Repository,
210 panproto_store: &mut S,
211 revspec: &str,
212) -> Result<ImportResult, GitBridgeError> {
213 import_git_repo_incremental(git_repo, panproto_store, revspec, &FxHashMap::default())
214}
215
216/// Import a git repository, persisting the blob-to-schema cache
217/// under `cache_dir/<BLOB_CACHE_FILE>` so subsequent imports
218/// deduplicate unchanged files without re-parsing them.
219///
220/// This is the production entry point: `cache_dir` is usually the
221/// per-remote panproto cache directory
222/// (`$GIT_DIR/panproto-cache/<remote>/`). Pass an empty `known` map
223/// for a full import, or the existing git-to-panproto marks for an
224/// incremental one.
225///
226/// # Errors
227///
228/// Returns [`GitBridgeError`] if git operations, parsing, or VCS
229/// operations fail. The cache file is loaded best-effort; a corrupt
230/// cache propagates as [`GitBridgeError::BlobCache`] so the caller
231/// can choose to delete-and-restart rather than silently re-import.
232pub fn import_git_repo_persistent<S: Store, H: BuildHasher>(
233 git_repo: &git2::Repository,
234 panproto_store: &mut S,
235 revspec: &str,
236 known: &HashMap<git2::Oid, ObjectId, H>,
237 cache_dir: &Path,
238) -> Result<ImportResult, GitBridgeError> {
239 let cache_path = cache_dir.join(BLOB_CACHE_FILE);
240 let mut cache =
241 load_blob_cache(&cache_path).map_err(|e| GitBridgeError::BlobCache(e.to_string()))?;
242 let result = import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut cache)?;
243 save_blob_cache(&cache_path, &cache).map_err(|e| GitBridgeError::BlobCache(e.to_string()))?;
244 Ok(result)
245}
246
247/// Incrementally import a range of git commits into a panproto-vcs store.
248///
249/// Like [`import_git_repo`], but skips commits whose git OID appears in
250/// `known`. The `known` map provides the panproto-vcs [`ObjectId`] that
251/// each already-imported git commit was translated to, so that children
252/// of skipped commits can be wired up to the correct panproto parent.
253///
254/// Skipping is performed via `git2`'s revwalk `hide`, so the walker never
255/// visits ancestors of known commits either. This makes repeated imports
256/// against a persistent store run in time proportional to the *new*
257/// commits, not the full history.
258///
259/// # Edge cases
260///
261/// - If `revspec` itself resolves to a commit in `known`, no commits are
262/// imported and [`ImportResult::head_id`] is set from the `known` map.
263/// - If a new commit has a parent that is neither in `known` nor walked
264/// (i.e. the `known` map is inconsistent with the actual DAG), that
265/// parent is dropped from the panproto commit's parents, matching the
266/// behavior of the non-incremental path.
267///
268/// # Errors
269///
270/// Returns [`GitBridgeError`] if git operations, parsing, or VCS operations fail.
271pub fn import_git_repo_incremental<S: Store, H: BuildHasher>(
272 git_repo: &git2::Repository,
273 panproto_store: &mut S,
274 revspec: &str,
275 known: &HashMap<git2::Oid, ObjectId, H>,
276) -> Result<ImportResult, GitBridgeError> {
277 // Delegate to the cache-aware path with an in-memory cache so
278 // every call still gets within-import dedup: two commits that
279 // reference the same git blob share a single FileSchemaObject.
280 // Production callers that want cross-call dedup should go through
281 // [`import_git_repo_persistent`] with a real cache directory.
282 let mut cache = BlobSchemaCache::default();
283 import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut cache)
284}
285
286/// Import a git repository using per-file content addressing.
287///
288/// Like [`import_git_repo_incremental`], but stores each commit's
289/// project schema as a Merkle tree of [`FileSchemaObject`] leaves
290/// keyed by git blob OID. Unchanged files reuse their existing
291/// [`FileSchemaObject`] [`ObjectId`] across commits; only the
292/// [`panproto_vcs::SchemaTreeObject`] nodes on the path from the
293/// changed file to the project root are rewritten.
294///
295/// The `blob_cache` is read and updated in place; callers should
296/// persist it across imports (e.g., under
297/// `$GIT_DIR/panproto-cache/<remote>/blob_to_schema`) so repeated
298/// imports only parse blobs that are genuinely new.
299///
300/// # Errors
301///
302/// Returns [`GitBridgeError`] if git operations, parsing, or VCS
303/// operations fail.
304pub fn import_git_repo_with_cache<S, H>(
305 git_repo: &git2::Repository,
306 panproto_store: &mut S,
307 revspec: &str,
308 known: &HashMap<git2::Oid, ObjectId, H>,
309 blob_cache: &mut BlobSchemaCache,
310) -> Result<ImportResult, GitBridgeError>
311where
312 S: Store,
313 H: BuildHasher,
314{
315 let obj = git_repo.revparse_single(revspec)?;
316 let head_commit = obj
317 .peel_to_commit()
318 .map_err(|e| GitBridgeError::ObjectRead {
319 oid: obj.id().to_string(),
320 reason: format!("not a commit: {e}"),
321 })?;
322 let head_git_oid = head_commit.id();
323
324 let mut commits = Vec::new();
325 collect_new_ancestors(git_repo, head_git_oid, known, &mut commits)?;
326
327 let mut git_to_panproto: FxHashMap<git2::Oid, ObjectId> =
328 known.iter().map(|(&k, &v)| (k, v)).collect();
329 let mut oid_map: Vec<(git2::Oid, ObjectId)> = Vec::new();
330 let mut last_id = ObjectId::ZERO;
331
332 for git_oid in &commits {
333 let git_commit = git_repo.find_commit(*git_oid)?;
334 let tree = git_commit.tree()?;
335
336 // Collect (path, FileSchema ObjectId) for every blob under the
337 // git tree, reusing cached IDs where possible.
338 let mut leaves: Vec<(PathBuf, ObjectId)> = Vec::new();
339 collect_tree_leaves(
340 git_repo,
341 &tree,
342 Path::new(""),
343 panproto_store,
344 blob_cache,
345 &mut leaves,
346 )?;
347
348 // Empty trees (initial commit with no files) get a synthetic
349 // single-file leaf so the commit still points at a schema
350 // tree rather than a flat schema.
351 let root_id = if leaves.is_empty() {
352 let proto = panproto_protocols::raw_file::protocol();
353 let schema = panproto_schema::SchemaBuilder::new(&proto)
354 .vertex("root", "file", None)
355 .map_err(|e| {
356 GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
357 reason: format!("empty tree schema: {e}"),
358 })
359 })?
360 .build()
361 .map_err(|e| {
362 GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
363 reason: format!("empty tree build: {e}"),
364 })
365 })?;
366 let file = FileSchemaObject {
367 path: "__empty__".to_owned(),
368 protocol: "raw_file".to_owned(),
369 schema,
370 cross_file_edges: Vec::new(),
371 };
372 let leaf_id = panproto_store.put(&Object::FileSchema(Box::new(file)))?;
373 panproto_vcs::build_tree_from_leaves(
374 panproto_store,
375 vec![(PathBuf::from("__empty__"), leaf_id)],
376 )
377 .map_err(GitBridgeError::Vcs)?
378 } else {
379 panproto_vcs::build_tree_from_leaves(panproto_store, leaves)
380 .map_err(GitBridgeError::Vcs)?
381 };
382
383 let parents: Vec<ObjectId> = git_commit
384 .parent_ids()
385 .filter_map(|parent_oid| git_to_panproto.get(&parent_oid).copied())
386 .collect();
387
388 let author_sig = git_commit.author();
389 let author = author_sig.name().unwrap_or("unknown").to_owned();
390 let timestamp = u64::try_from(author_sig.when().seconds()).unwrap_or(0);
391 let message = git_commit.message().unwrap_or("(no message)").to_owned();
392
393 let commit = CommitObject::builder(root_id, "project", &author, &message)
394 .parents(parents)
395 .timestamp(timestamp)
396 .build();
397
398 let commit_id = panproto_store.put(&Object::Commit(commit))?;
399
400 git_to_panproto.insert(*git_oid, commit_id);
401 oid_map.push((*git_oid, commit_id));
402 last_id = commit_id;
403 }
404
405 if commits.is_empty() {
406 if let Some(&id) = known.get(&head_git_oid) {
407 last_id = id;
408 }
409 }
410
411 Ok(ImportResult {
412 commit_count: commits.len(),
413 head_id: last_id,
414 oid_map,
415 })
416}
417
418/// Walk a git tree, recording a `(path, FileSchema ObjectId)` leaf
419/// for every blob. Parses and stores blobs whose OIDs are not in
420/// `blob_cache`, and updates the cache with the resulting IDs.
421fn collect_tree_leaves<S: Store>(
422 repo: &git2::Repository,
423 tree: &git2::Tree<'_>,
424 prefix: &Path,
425 store: &mut S,
426 blob_cache: &mut BlobSchemaCache,
427 leaves: &mut Vec<(PathBuf, ObjectId)>,
428) -> Result<(), GitBridgeError> {
429 for entry in tree {
430 let name = entry
431 .name()
432 .ok_or_else(|| GitBridgeError::NonUtf8TreeEntry {
433 parent: prefix.display().to_string(),
434 })?;
435 let path = prefix.join(name);
436
437 match entry.kind() {
438 Some(git2::ObjectType::Blob) => {
439 let blob_oid = entry.id();
440 // Probe the cache before parsing; we need the
441 // protocol to key the cache, so detect it first.
442 let protocol_guess = panproto_project::detect::detect_language(
443 &path,
444 &panproto_parse::ParserRegistry::new(),
445 )
446 .map_or_else(String::new, ToOwned::to_owned);
447 let leaf_id =
448 if let Some(&cached) = blob_cache.get(&(blob_oid, protocol_guess.clone())) {
449 cached
450 } else {
451 let blob = repo.find_blob(blob_oid)?;
452 let content = blob.content();
453 let (schema, protocol) = parse_single_blob(&path, content)?;
454 let file = FileSchemaObject {
455 path: path.display().to_string(),
456 protocol: protocol.clone(),
457 schema,
458 cross_file_edges: Vec::new(),
459 };
460 let id = store.put(&Object::FileSchema(Box::new(file)))?;
461 // Record under the protocol actually used so a
462 // second cache probe for the same (blob, proto)
463 // pair hits even when detection and the parser
464 // disagree (e.g., raw_file fallback).
465 blob_cache.insert((blob_oid, protocol.clone()), id);
466 if protocol != protocol_guess {
467 blob_cache.insert((blob_oid, protocol_guess), id);
468 }
469 id
470 };
471 leaves.push((path, leaf_id));
472 }
473 Some(git2::ObjectType::Tree) => {
474 let subtree = repo.find_tree(entry.id())?;
475 collect_tree_leaves(repo, &subtree, &path, store, blob_cache, leaves)?;
476 }
477 _ => {}
478 }
479 }
480 Ok(())
481}
482
483/// Parse a single git blob into a per-file schema plus the protocol
484/// name used to parse it.
485///
486/// Goes through [`ProjectBuilder`] so the file-to-schema pipeline
487/// matches what the full-repo path does.
488fn parse_single_blob(
489 path: &Path,
490 content: &[u8],
491) -> Result<(panproto_schema::Schema, String), GitBridgeError> {
492 let mut builder = ProjectBuilder::new();
493 builder.add_file(path, content)?;
494 let schemas = builder.file_schemas().clone();
495 let protocols = builder.protocol_map_ref().clone();
496 let schema = schemas.into_iter().next().map(|(_, s)| s).ok_or_else(|| {
497 GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
498 reason: "single-blob parse produced no schema".to_owned(),
499 })
500 })?;
501 let protocol = protocols
502 .into_iter()
503 .next()
504 .map_or_else(|| "raw_file".to_owned(), |(_, p)| p);
505 Ok((schema, protocol))
506}
507
508/// Collect ancestor commits in topological order (parents first), skipping
509/// any commit reachable from an entry in `known`.
510fn collect_new_ancestors<H: BuildHasher>(
511 repo: &git2::Repository,
512 head: git2::Oid,
513 known: &HashMap<git2::Oid, ObjectId, H>,
514 result: &mut Vec<git2::Oid>,
515) -> Result<(), GitBridgeError> {
516 let mut revwalk = repo.revwalk()?;
517 revwalk.push(head)?;
518 revwalk.set_sorting(git2::Sort::TOPOLOGICAL | git2::Sort::REVERSE)?;
519
520 // Hide known commits and all their ancestors from the walk.
521 for git_oid in known.keys() {
522 // A known OID may not correspond to a commit reachable from `head`
523 // (e.g. leftover mapping from a deleted branch). `hide` errors in
524 // that case; ignore so an out-of-date map doesn't break imports.
525 let _ = revwalk.hide(*git_oid);
526 }
527
528 for oid_result in revwalk {
529 result.push(oid_result?);
530 }
531
532 Ok(())
533}