// panproto_git/import.rs
1//! Import git repositories into panproto-vcs.
2//!
3//! Walks the git commit DAG topologically, parses each commit's file tree
4//! into a panproto project schema, and creates panproto-vcs commits that
5//! preserve authorship, timestamps, and parent structure.
6
7use std::collections::HashMap;
8use std::hash::BuildHasher;
9use std::path::{Path, PathBuf};
10
11use panproto_project::ProjectBuilder;
12use panproto_vcs::{CommitObject, FileSchemaObject, Object, ObjectId, Store};
13use rustc_hash::FxHashMap;
14
15use crate::error::GitBridgeError;
16
/// Standard on-disk name of the blob-OID to `FileSchema`
/// [`ObjectId`] cache.
///
/// Joined onto the caller-supplied cache directory by
/// [`import_git_repo_persistent`].
pub const BLOB_CACHE_FILE: &str = "blob_to_schema";
20
/// Error loading a blob-to-schema cache.
///
/// Returned by [`load_blob_cache`]. A missing cache file is *not* an
/// error (it yields an empty cache); these variants cover a file that
/// exists but is unreadable or malformed.
#[derive(Debug, thiserror::Error)]
pub enum BlobCacheLoadError {
    /// The cache file exists but could not be parsed.
    #[error("blob cache at {path} is corrupt at line {line}: {reason}")]
    Corrupt {
        /// Display of the cache file path.
        path: String,
        /// 1-based line number of the first malformed entry.
        line: usize,
        /// What went wrong on that line.
        reason: String,
    },

    /// An I/O error occurred while reading the cache.
    #[error("blob cache at {path}: {source}")]
    Io {
        /// Display of the cache file path.
        path: String,
        /// The underlying I/O error.
        #[source]
        source: std::io::Error,
    },
}
45
46/// Load a blob-to-schema cache from a plain-text file.
47///
48/// File format: one entry per line,
49/// `<git_blob_oid> <protocol_name> <file_schema_panproto_id>`. Every
50/// entry carries a non-empty protocol slot; a line with a missing or
51/// empty protocol is rejected as corrupt so the caller can delete the
52/// file and reimport rather than round-trip through an empty slot.
53///
54/// # Errors
55///
56/// Returns [`BlobCacheLoadError::Io`] for I/O problems other than a
57/// missing file (missing yields an empty cache), and
58/// [`BlobCacheLoadError::Corrupt`] if any line cannot be parsed.
59pub fn load_blob_cache(path: &Path) -> Result<BlobSchemaCache, BlobCacheLoadError> {
60    let content = match std::fs::read_to_string(path) {
61        Ok(c) => c,
62        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
63            return Ok(BlobSchemaCache::default());
64        }
65        Err(source) => {
66            return Err(BlobCacheLoadError::Io {
67                path: path.display().to_string(),
68                source,
69            });
70        }
71    };
72    let mut map = BlobSchemaCache::default();
73    for (idx, line) in content.lines().enumerate() {
74        if line.trim().is_empty() {
75            continue;
76        }
77        let mut parts = line.split_whitespace();
78        let Some(blob_hex) = parts.next() else {
79            continue;
80        };
81        let Some(protocol) = parts.next() else {
82            return Err(BlobCacheLoadError::Corrupt {
83                path: path.display().to_string(),
84                line: idx + 1,
85                reason: "missing protocol slot; delete the cache file and reimport".to_owned(),
86            });
87        };
88        let Some(panproto_hex) = parts.next() else {
89            return Err(BlobCacheLoadError::Corrupt {
90                path: path.display().to_string(),
91                line: idx + 1,
92                reason: "missing panproto id".to_owned(),
93            });
94        };
95        let blob_oid = git2::Oid::from_str(blob_hex).map_err(|e| BlobCacheLoadError::Corrupt {
96            path: path.display().to_string(),
97            line: idx + 1,
98            reason: format!("bad git oid: {e}"),
99        })?;
100        let panproto_id =
101            panproto_hex
102                .parse::<ObjectId>()
103                .map_err(|e| BlobCacheLoadError::Corrupt {
104                    path: path.display().to_string(),
105                    line: idx + 1,
106                    reason: format!("bad panproto id: {e}"),
107                })?;
108        map.insert((blob_oid, protocol.to_owned()), panproto_id);
109    }
110    Ok(map)
111}
112
113/// Persist a blob-to-schema cache atomically.
114///
115/// Writes to `<path>.tmp` and renames into place, so a crash mid-write
116/// cannot leave a partial file that would later parse as corrupt.
117///
118/// # Errors
119///
120/// Returns any I/O error encountered while creating parent
121/// directories, writing, or renaming.
122pub fn save_blob_cache(path: &Path, cache: &BlobSchemaCache) -> std::io::Result<()> {
123    use std::io::Write;
124    let parent = path.parent().ok_or_else(|| {
125        std::io::Error::new(
126            std::io::ErrorKind::InvalidInput,
127            "blob cache path has no parent directory",
128        )
129    })?;
130    std::fs::create_dir_all(parent)?;
131    let mut lines: Vec<String> = Vec::with_capacity(cache.len());
132    for ((blob, protocol), id) in cache {
133        if protocol.is_empty() {
134            return Err(std::io::Error::new(
135                std::io::ErrorKind::InvalidInput,
136                format!(
137                    "blob cache entry for {blob} has empty protocol; every entry must carry a protocol name"
138                ),
139            ));
140        }
141        lines.push(format!("{blob} {protocol} {id}"));
142    }
143    lines.sort();
144    let body = lines.join("\n") + "\n";
145    let tmp = path.with_extension("tmp");
146    // Create + write + fsync the temp file so its bytes are on disk
147    // before the rename. Then fsync the parent directory so the
148    // rename itself is durable; without this, a crash can leave the
149    // rename unrecorded even though the payload is on disk.
150    {
151        let mut f = std::fs::File::create(&tmp)?;
152        f.write_all(body.as_bytes())?;
153        f.sync_all()?;
154    }
155    std::fs::rename(&tmp, path)?;
156    let dir = std::fs::File::open(parent)?;
157    dir.sync_all()?;
158    Ok(())
159}
160
/// Cache mapping a `(git blob OID, protocol)` pair to the
/// content-addressed [`ObjectId`] of the [`FileSchemaObject`]
/// produced by parsing it.
///
/// Keying on the protocol avoids a cross-protocol collision: the
/// same bytes appearing as `a.py` and `a.txt` parse to different
/// per-file schemas and therefore must hash to different
/// [`ObjectId`]s, which means they must occupy different cache
/// slots.
///
/// A [`BlobSchemaCache`] is the key to making incremental tree-based
/// imports cheap: when a new git commit only changes one file, every
/// other `(blob, protocol)` pair is already in the cache, so the
/// importer reuses the existing [`ObjectId`] and only has to rewrite
/// the tree-node objects on the path from the changed file to the
/// project root.
///
/// Persisted by [`save_blob_cache`] and reloaded by
/// [`load_blob_cache`]; protocol slots are always non-empty in a
/// cache that round-trips through those functions.
pub type BlobSchemaCache = FxHashMap<(git2::Oid, String), ObjectId>;
178
/// Result of importing a git repository.
#[derive(Debug)]
pub struct ImportResult {
    /// Number of commits imported (commits already present in the
    /// caller's `known` map are not counted).
    pub commit_count: usize,
    /// The panproto-vcs object ID of the HEAD commit after import.
    pub head_id: ObjectId,
    /// Mapping from git commit OIDs to panproto-vcs object IDs, in
    /// import order (parents before children).
    pub oid_map: Vec<(git2::Oid, ObjectId)>,
}
189
190/// Import a range of git commits into a panproto-vcs store.
191///
192/// Walks the git commit DAG starting from `revspec` (e.g. "HEAD", "main",
193/// "HEAD~10..HEAD") in topological order. For each commit:
194///
195/// 1. Reads all files from the git tree
196/// 2. Parses them into a project schema via `panproto-project`
197/// 3. Stores the schema as a panproto-vcs object
198/// 4. Creates a panproto-vcs commit preserving author, timestamp, message, parents
199///
200/// This is a convenience wrapper around [`import_git_repo_incremental`] with
201/// an empty `known` map, which re-imports the entire history reachable from
202/// `revspec`. For repeated imports against a persistent store, prefer
203/// [`import_git_repo_incremental`] to avoid walking already-imported ancestors.
204///
205/// # Errors
206///
207/// Returns [`GitBridgeError`] if git operations, parsing, or VCS operations fail.
208pub fn import_git_repo<S: Store>(
209    git_repo: &git2::Repository,
210    panproto_store: &mut S,
211    revspec: &str,
212) -> Result<ImportResult, GitBridgeError> {
213    import_git_repo_incremental(git_repo, panproto_store, revspec, &FxHashMap::default())
214}
215
216/// Import a git repository, persisting the blob-to-schema cache
217/// under `cache_dir/<BLOB_CACHE_FILE>` so subsequent imports
218/// deduplicate unchanged files without re-parsing them.
219///
220/// This is the production entry point: `cache_dir` is usually the
221/// per-remote panproto cache directory
222/// (`$GIT_DIR/panproto-cache/<remote>/`). Pass an empty `known` map
223/// for a full import, or the existing git-to-panproto marks for an
224/// incremental one.
225///
226/// # Errors
227///
228/// Returns [`GitBridgeError`] if git operations, parsing, or VCS
229/// operations fail. The cache file is loaded best-effort; a corrupt
230/// cache propagates as [`GitBridgeError::BlobCache`] so the caller
231/// can choose to delete-and-restart rather than silently re-import.
232pub fn import_git_repo_persistent<S: Store, H: BuildHasher>(
233    git_repo: &git2::Repository,
234    panproto_store: &mut S,
235    revspec: &str,
236    known: &HashMap<git2::Oid, ObjectId, H>,
237    cache_dir: &Path,
238) -> Result<ImportResult, GitBridgeError> {
239    let cache_path = cache_dir.join(BLOB_CACHE_FILE);
240    let mut cache =
241        load_blob_cache(&cache_path).map_err(|e| GitBridgeError::BlobCache(e.to_string()))?;
242    let result = import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut cache)?;
243    save_blob_cache(&cache_path, &cache).map_err(|e| GitBridgeError::BlobCache(e.to_string()))?;
244    Ok(result)
245}
246
247/// Incrementally import a range of git commits into a panproto-vcs store.
248///
249/// Like [`import_git_repo`], but skips commits whose git OID appears in
250/// `known`. The `known` map provides the panproto-vcs [`ObjectId`] that
251/// each already-imported git commit was translated to, so that children
252/// of skipped commits can be wired up to the correct panproto parent.
253///
254/// Skipping is performed via `git2`'s revwalk `hide`, so the walker never
255/// visits ancestors of known commits either. This makes repeated imports
256/// against a persistent store run in time proportional to the *new*
257/// commits, not the full history.
258///
259/// # Edge cases
260///
261/// - If `revspec` itself resolves to a commit in `known`, no commits are
262///   imported and [`ImportResult::head_id`] is set from the `known` map.
263/// - If a new commit has a parent that is neither in `known` nor walked
264///   (i.e. the `known` map is inconsistent with the actual DAG), that
265///   parent is dropped from the panproto commit's parents, matching the
266///   behavior of the non-incremental path.
267///
268/// # Errors
269///
270/// Returns [`GitBridgeError`] if git operations, parsing, or VCS operations fail.
271pub fn import_git_repo_incremental<S: Store, H: BuildHasher>(
272    git_repo: &git2::Repository,
273    panproto_store: &mut S,
274    revspec: &str,
275    known: &HashMap<git2::Oid, ObjectId, H>,
276) -> Result<ImportResult, GitBridgeError> {
277    // Delegate to the cache-aware path with an in-memory cache so
278    // every call still gets within-import dedup: two commits that
279    // reference the same git blob share a single FileSchemaObject.
280    // Production callers that want cross-call dedup should go through
281    // [`import_git_repo_persistent`] with a real cache directory.
282    let mut cache = BlobSchemaCache::default();
283    import_git_repo_with_cache(git_repo, panproto_store, revspec, known, &mut cache)
284}
285
/// Import a git repository using per-file content addressing.
///
/// Like [`import_git_repo_incremental`], but stores each commit's
/// project schema as a Merkle tree of [`FileSchemaObject`] leaves
/// keyed by git blob OID. Unchanged files reuse their existing
/// [`FileSchemaObject`] [`ObjectId`] across commits; only the
/// [`panproto_vcs::SchemaTreeObject`] nodes on the path from the
/// changed file to the project root are rewritten.
///
/// The `blob_cache` is read and updated in place; callers should
/// persist it across imports (e.g., under
/// `$GIT_DIR/panproto-cache/<remote>/blob_to_schema`) so repeated
/// imports only parse blobs that are genuinely new.
///
/// # Errors
///
/// Returns [`GitBridgeError`] if git operations, parsing, or VCS
/// operations fail.
pub fn import_git_repo_with_cache<S, H>(
    git_repo: &git2::Repository,
    panproto_store: &mut S,
    revspec: &str,
    known: &HashMap<git2::Oid, ObjectId, H>,
    blob_cache: &mut BlobSchemaCache,
) -> Result<ImportResult, GitBridgeError>
where
    S: Store,
    H: BuildHasher,
{
    // Resolve the revspec and peel it down to a commit; non-commit-ish
    // objects surface as an ObjectRead error naming the offending OID.
    let obj = git_repo.revparse_single(revspec)?;
    let head_commit = obj
        .peel_to_commit()
        .map_err(|e| GitBridgeError::ObjectRead {
            oid: obj.id().to_string(),
            reason: format!("not a commit: {e}"),
        })?;
    let head_git_oid = head_commit.id();

    // `commits` holds only the NOT-yet-imported ancestors, parents first,
    // so each commit's parents are translated before the commit itself.
    let mut commits = Vec::new();
    collect_new_ancestors(git_repo, head_git_oid, known, &mut commits)?;

    // Seed the translation table with the caller's known mapping so
    // children of already-imported commits find their panproto parents.
    let mut git_to_panproto: FxHashMap<git2::Oid, ObjectId> =
        known.iter().map(|(&k, &v)| (k, v)).collect();
    let mut oid_map: Vec<(git2::Oid, ObjectId)> = Vec::new();
    let mut last_id = ObjectId::ZERO;

    for git_oid in &commits {
        let git_commit = git_repo.find_commit(*git_oid)?;
        let tree = git_commit.tree()?;

        // Collect (path, FileSchema ObjectId) for every blob under the
        // git tree, reusing cached IDs where possible.
        let mut leaves: Vec<(PathBuf, ObjectId)> = Vec::new();
        collect_tree_leaves(
            git_repo,
            &tree,
            Path::new(""),
            panproto_store,
            blob_cache,
            &mut leaves,
        )?;

        // Empty trees (initial commit with no files) get a synthetic
        // single-file leaf so the commit still points at a schema
        // tree rather than a flat schema.
        let root_id = if leaves.is_empty() {
            let proto = panproto_protocols::raw_file::protocol();
            let schema = panproto_schema::SchemaBuilder::new(&proto)
                .vertex("root", "file", None)
                .map_err(|e| {
                    GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
                        reason: format!("empty tree schema: {e}"),
                    })
                })?
                .build()
                .map_err(|e| {
                    GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
                        reason: format!("empty tree build: {e}"),
                    })
                })?;
            let file = FileSchemaObject {
                path: "__empty__".to_owned(),
                protocol: "raw_file".to_owned(),
                schema,
                cross_file_edges: Vec::new(),
            };
            let leaf_id = panproto_store.put(&Object::FileSchema(Box::new(file)))?;
            panproto_vcs::build_tree_from_leaves(
                panproto_store,
                vec![(PathBuf::from("__empty__"), leaf_id)],
            )
            .map_err(GitBridgeError::Vcs)?
        } else {
            panproto_vcs::build_tree_from_leaves(panproto_store, leaves)
                .map_err(GitBridgeError::Vcs)?
        };

        // Parents missing from the translation table (inconsistent `known`
        // map) are silently dropped, per the documented edge case.
        let parents: Vec<ObjectId> = git_commit
            .parent_ids()
            .filter_map(|parent_oid| git_to_panproto.get(&parent_oid).copied())
            .collect();

        // Preserve git authorship metadata; pre-epoch timestamps clamp to 0.
        let author_sig = git_commit.author();
        let author = author_sig.name().unwrap_or("unknown").to_owned();
        let timestamp = u64::try_from(author_sig.when().seconds()).unwrap_or(0);
        let message = git_commit.message().unwrap_or("(no message)").to_owned();

        let commit = CommitObject::builder(root_id, "project", &author, &message)
            .parents(parents)
            .timestamp(timestamp)
            .build();

        let commit_id = panproto_store.put(&Object::Commit(commit))?;

        git_to_panproto.insert(*git_oid, commit_id);
        oid_map.push((*git_oid, commit_id));
        last_id = commit_id;
    }

    // Nothing new to import: if the head itself was already known, report
    // its existing panproto ID as the head.
    if commits.is_empty() {
        if let Some(&id) = known.get(&head_git_oid) {
            last_id = id;
        }
    }

    Ok(ImportResult {
        commit_count: commits.len(),
        head_id: last_id,
        oid_map,
    })
}
417
418/// Walk a git tree, recording a `(path, FileSchema ObjectId)` leaf
419/// for every blob. Parses and stores blobs whose OIDs are not in
420/// `blob_cache`, and updates the cache with the resulting IDs.
421fn collect_tree_leaves<S: Store>(
422    repo: &git2::Repository,
423    tree: &git2::Tree<'_>,
424    prefix: &Path,
425    store: &mut S,
426    blob_cache: &mut BlobSchemaCache,
427    leaves: &mut Vec<(PathBuf, ObjectId)>,
428) -> Result<(), GitBridgeError> {
429    for entry in tree {
430        let name = entry
431            .name()
432            .ok_or_else(|| GitBridgeError::NonUtf8TreeEntry {
433                parent: prefix.display().to_string(),
434            })?;
435        let path = prefix.join(name);
436
437        match entry.kind() {
438            Some(git2::ObjectType::Blob) => {
439                let blob_oid = entry.id();
440                // Probe the cache before parsing; we need the
441                // protocol to key the cache, so detect it first.
442                let protocol_guess = panproto_project::detect::detect_language(
443                    &path,
444                    &panproto_parse::ParserRegistry::new(),
445                )
446                .map_or_else(String::new, ToOwned::to_owned);
447                let leaf_id =
448                    if let Some(&cached) = blob_cache.get(&(blob_oid, protocol_guess.clone())) {
449                        cached
450                    } else {
451                        let blob = repo.find_blob(blob_oid)?;
452                        let content = blob.content();
453                        let (schema, protocol) = parse_single_blob(&path, content)?;
454                        let file = FileSchemaObject {
455                            path: path.display().to_string(),
456                            protocol: protocol.clone(),
457                            schema,
458                            cross_file_edges: Vec::new(),
459                        };
460                        let id = store.put(&Object::FileSchema(Box::new(file)))?;
461                        // Record under the protocol actually used so a
462                        // second cache probe for the same (blob, proto)
463                        // pair hits even when detection and the parser
464                        // disagree (e.g., raw_file fallback).
465                        blob_cache.insert((blob_oid, protocol.clone()), id);
466                        if protocol != protocol_guess {
467                            blob_cache.insert((blob_oid, protocol_guess), id);
468                        }
469                        id
470                    };
471                leaves.push((path, leaf_id));
472            }
473            Some(git2::ObjectType::Tree) => {
474                let subtree = repo.find_tree(entry.id())?;
475                collect_tree_leaves(repo, &subtree, &path, store, blob_cache, leaves)?;
476            }
477            _ => {}
478        }
479    }
480    Ok(())
481}
482
483/// Parse a single git blob into a per-file schema plus the protocol
484/// name used to parse it.
485///
486/// Goes through [`ProjectBuilder`] so the file-to-schema pipeline
487/// matches what the full-repo path does.
488fn parse_single_blob(
489    path: &Path,
490    content: &[u8],
491) -> Result<(panproto_schema::Schema, String), GitBridgeError> {
492    let mut builder = ProjectBuilder::new();
493    builder.add_file(path, content)?;
494    let schemas = builder.file_schemas().clone();
495    let protocols = builder.protocol_map_ref().clone();
496    let schema = schemas.into_iter().next().map(|(_, s)| s).ok_or_else(|| {
497        GitBridgeError::Project(panproto_project::ProjectError::CoproductFailed {
498            reason: "single-blob parse produced no schema".to_owned(),
499        })
500    })?;
501    let protocol = protocols
502        .into_iter()
503        .next()
504        .map_or_else(|| "raw_file".to_owned(), |(_, p)| p);
505    Ok((schema, protocol))
506}
507
508/// Collect ancestor commits in topological order (parents first), skipping
509/// any commit reachable from an entry in `known`.
510fn collect_new_ancestors<H: BuildHasher>(
511    repo: &git2::Repository,
512    head: git2::Oid,
513    known: &HashMap<git2::Oid, ObjectId, H>,
514    result: &mut Vec<git2::Oid>,
515) -> Result<(), GitBridgeError> {
516    let mut revwalk = repo.revwalk()?;
517    revwalk.push(head)?;
518    revwalk.set_sorting(git2::Sort::TOPOLOGICAL | git2::Sort::REVERSE)?;
519
520    // Hide known commits and all their ancestors from the walk.
521    for git_oid in known.keys() {
522        // A known OID may not correspond to a commit reachable from `head`
523        // (e.g. leftover mapping from a deleted branch). `hide` errors in
524        // that case; ignore so an out-of-date map doesn't break imports.
525        let _ = revwalk.hide(*git_oid);
526    }
527
528    for oid_result in revwalk {
529        result.push(oid_result?);
530    }
531
532    Ok(())
533}