cargo_docs_md/source/
collector.rs

1//! Source collector for copying dependency sources to a local directory.
2//!
3//! This module provides functionality to collect dependency source code
4//! from `~/.cargo/registry/src/` into a local `.source_{timestamp}/` directory.
5
6use std::collections::{HashMap, HashSet};
7use std::io::Write;
8use std::path::{Path, PathBuf};
9use std::time::{SystemTime, UNIX_EPOCH};
10use std::{env as StdEnv, fs as StdFs};
11
12use cargo_metadata::{DependencyKind, Metadata, MetadataCommand, PackageId};
13use serde_json as SJSON;
14
15use crate::error::Error;
16
17/// Metadata about a collected crate.
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
19pub struct CollectedCrate {
20    /// Crate name.
21    pub name: String,
22
23    /// Crate version.
24    pub version: String,
25
26    /// Rust edition.
27    pub edition: String,
28
29    /// Enabled features.
30    pub features: Vec<String>,
31
32    /// Crate description.
33    pub description: Option<String>,
34
35    /// Relative path within the .source_*/ directory.
36    pub source_path: String,
37}
38
39/// Manifest stored in `.source_*/manifest.json`.
40#[derive(Debug, serde::Serialize, serde::Deserialize)]
41pub struct SourceManifest {
42    /// When the sources were collected.
43    pub collected_at: String,
44
45    /// Workspace root path.
46    pub workspace_root: String,
47
48    /// Collected crates by key "{name}-{version}".
49    pub crates: HashMap<String, CollectedCrate>,
50}
51
/// Summary returned after a collection (or dry-run) pass.
#[derive(Debug)]
pub struct CollectionResult {
    /// The `.source_*/` directory (or custom output path) the run targets.
    ///
    /// A dry run reports this path without creating it.
    pub output_dir: PathBuf,

    /// How many crates were found in the registry and counted as collected.
    pub crates_collected: usize,

    /// `"{name}-{version}"` entries for crates that had no matching
    /// directory in the registry and were therefore skipped.
    pub skipped: Vec<String>,
}
64
/// Knobs controlling a source-collection run.
///
/// The default value excludes dev-dependencies, uses a generated output
/// directory, and performs a real (non-dry) run.
#[derive(Default, Debug)]
pub struct CollectOptions {
    /// When `true`, dev-only dependencies are collected as well.
    pub include_dev: bool,

    /// Explicit output directory; when set it replaces the generated
    /// timestamp-based directory name.
    pub output: Option<PathBuf>,

    /// When `true`, only report what would be collected without copying files.
    pub dry_run: bool,
}
77
78/// Collector for gathering dependency sources.
79#[derive(Debug)]
80pub struct SourceCollector {
81    /// Cargo metadata for the workspace.
82    metadata: Metadata,
83
84    /// Path to cargo registry sources.
85    registry_path: PathBuf,
86}
87
88impl SourceCollector {
89    /// Create a new collector for the current directory.
90    ///
91    /// # Errors
92    ///
93    /// Returns an error if cargo metadata cannot be loaded.
94    pub fn new() -> Result<Self, Error> {
95        Self::from_manifest(None)
96    }
97
98    /// Create a new collector from a specific manifest path.
99    ///
100    /// # Errors
101    ///
102    /// Returns an error if cargo metadata cannot be loaded.
103    pub fn from_manifest(manifest_path: Option<&Path>) -> Result<Self, Error> {
104        let mut cmd = MetadataCommand::new();
105
106        if let Some(path) = manifest_path {
107            cmd.manifest_path(path);
108        }
109
110        let metadata = cmd
111            .exec()
112            .map_err(|e| Error::SourceCollector(format!("Failed to load cargo metadata: {e}")))?;
113
114        let home = StdEnv::var("HOME")
115            .or_else(|_| StdEnv::var("USERPROFILE"))
116            .map_err(|_| Error::SourceCollector("Could not determine home directory".into()))?;
117
118        let registry_path = PathBuf::from(home).join(".cargo/registry/src");
119
120        Ok(Self {
121            metadata,
122            registry_path,
123        })
124    }
125
126    /// Collect all dependency sources.
127    ///
128    /// # Errors
129    ///
130    /// Returns an error if collection fails.
131    pub fn collect(&self, options: &CollectOptions) -> Result<CollectionResult, Error> {
132        // Determine output directory
133        let output_dir = match &options.output {
134            Some(path) => path.clone(),
135            None => self.generate_output_dir()?,
136        };
137
138        if options.dry_run {
139            return self.dry_run_collect(&output_dir, options);
140        }
141
142        // Create output directory
143        StdFs::create_dir_all(&output_dir)
144            .map_err(|e| Error::SourceCollector(format!("Failed to create output dir: {e}")))?;
145
146        let mut manifest = SourceManifest {
147            collected_at: TimeUtils::chrono_lite_now(),
148            workspace_root: self.metadata.workspace_root.to_string(),
149            crates: HashMap::new(),
150        };
151
152        let mut skipped = Vec::new();
153        let mut collected_count = 0;
154
155        // Get dev-only packages if we need to filter them out
156        let dev_only = if options.include_dev {
157            HashSet::new()
158        } else {
159            self.get_dev_only_packages()
160        };
161
162        // Collect each external dependency
163        for pkg in &self.metadata.packages {
164            // Skip workspace members
165            if self.metadata.workspace_members.contains(&pkg.id) {
166                continue;
167            }
168
169            // Skip dev-only dependencies if not requested
170            if dev_only.contains(&pkg.id) {
171                continue;
172            }
173
174            let version = pkg.version.to_string();
175            let key = format!("{}-{}", pkg.name, version);
176
177            // Find source in registry
178            match self.find_registry_source(&pkg.name, &version) {
179                Some(source_path) => {
180                    let dest_dir = output_dir.join(&key);
181
182                    // Copy source files
183                    Self::copy_crate_source(&source_path, &dest_dir)?;
184
185                    // Add to manifest
186                    manifest.crates.insert(
187                        key.clone(),
188                        CollectedCrate {
189                            name: pkg.name.to_string(),
190                            version: version.clone(),
191                            edition: pkg.edition.to_string(),
192                            features: pkg.features.keys().cloned().collect(),
193                            description: pkg.description.clone(),
194                            source_path: key,
195                        },
196                    );
197
198                    collected_count += 1;
199                },
200                None => {
201                    skipped.push(format!("{}-{}", pkg.name, version));
202                },
203            }
204        }
205
206        // Write manifest.json
207        let manifest_path = output_dir.join("manifest.json");
208        let manifest_json = SJSON::to_string_pretty(&manifest)
209            .map_err(|e| Error::SourceCollector(format!("Failed to serialize manifest: {e}")))?;
210        StdFs::write(&manifest_path, manifest_json)
211            .map_err(|e| Error::SourceCollector(format!("Failed to write manifest: {e}")))?;
212
213        // Update .gitignore
214        self.update_gitignore()?;
215
216        Ok(CollectionResult {
217            output_dir,
218            crates_collected: collected_count,
219            skipped,
220        })
221    }
222
223    /// Generate a timestamp-based output directory name.
224    fn generate_output_dir(&self) -> Result<PathBuf, Error> {
225        let workspace_root = self.metadata.workspace_root.as_std_path();
226        let timestamp = SystemTime::now()
227            .duration_since(UNIX_EPOCH)
228            .map_err(|e| Error::SourceCollector(format!("Failed to get timestamp: {e}")))?
229            .as_secs();
230
231        // Try up to 3 times with incrementing timestamp
232        for i in 0..3 {
233            let dir_name = format!(".source_{}", timestamp + i);
234            let path = workspace_root.join(&dir_name);
235
236            if !path.exists() {
237                return Ok(path);
238            }
239        }
240
241        Err(Error::SourceCollector(
242            "Too many .source_* directories exist. Please clean up old ones.".into(),
243        ))
244    }
245
246    /// Find a crate's source in the cargo registry.
247    fn find_registry_source(&self, name: &str, version: &str) -> Option<PathBuf> {
248        if !self.registry_path.exists() {
249            return None;
250        }
251
252        let target_dir = format!("{name}-{version}");
253
254        // Scan registry index directories
255        for entry in StdFs::read_dir(&self.registry_path).ok()? {
256            let entry = entry.ok()?;
257            let index_path = entry.path();
258
259            if index_path.is_dir() {
260                let crate_path = index_path.join(&target_dir);
261
262                if crate_path.exists() && crate_path.is_dir() {
263                    return Some(crate_path);
264                }
265            }
266        }
267
268        None
269    }
270
271    /// Copy crate source to destination.
272    fn copy_crate_source(source: &Path, dest: &Path) -> Result<(), Error> {
273        StdFs::create_dir_all(dest)
274            .map_err(|e| Error::SourceCollector(format!("Failed to create dir: {e}")))?;
275
276        // Copy src/ directory
277        let src_dir = source.join("src");
278
279        if src_dir.exists() {
280            Self::copy_dir_recursive(&src_dir, &dest.join("src"))?;
281        }
282
283        // Copy and rename Cargo.toml to Crate.toml
284        let cargo_toml = source.join("Cargo.toml");
285        if cargo_toml.exists() {
286            StdFs::copy(&cargo_toml, dest.join("Crate.toml"))
287                .map_err(|e| Error::SourceCollector(format!("Failed to copy Cargo.toml: {e}")))?;
288        }
289
290        Ok(())
291    }
292
293    /// Get the set of package IDs that are dev-only dependencies.
294    ///
295    /// A package is considered dev-only if it is only reachable from workspace
296    /// members via dev-dependencies (not normal or build dependencies).
297    fn get_dev_only_packages(&self) -> HashSet<PackageId> {
298        let Some(resolve) = &self.metadata.resolve else {
299            return HashSet::new();
300        };
301
302        // Build a map of package ID to its node for quick lookup
303        let nodes: HashMap<&PackageId, _> =
304            resolve.nodes.iter().map(|node| (&node.id, node)).collect();
305
306        // Collect all packages reachable via non-dev dependencies from workspace members
307        let mut non_dev_reachable: HashSet<PackageId> = HashSet::new();
308        let mut to_visit: Vec<&PackageId> = self.metadata.workspace_members.iter().collect();
309
310        while let Some(pkg_id) = to_visit.pop() {
311            if let Some(node) = nodes.get(pkg_id) {
312                for dep in &node.deps {
313                    // Check if this dependency has any non-dev dependency kinds
314                    let has_non_dev = dep
315                        .dep_kinds
316                        .iter()
317                        .any(|dk| !matches!(dk.kind, DependencyKind::Development));
318
319                    if has_non_dev && non_dev_reachable.insert(dep.pkg.clone()) {
320                        to_visit.push(&dep.pkg);
321                    }
322                }
323            }
324        }
325
326        // Dev-only packages are those in metadata.packages but NOT in non_dev_reachable
327        // (excluding workspace members themselves)
328        self.metadata
329            .packages
330            .iter()
331            .filter(|pkg| {
332                !self.metadata.workspace_members.contains(&pkg.id)
333                    && !non_dev_reachable.contains(&pkg.id)
334            })
335            .map(|pkg| pkg.id.clone())
336            .collect()
337    }
338
339    /// Perform a dry run, returning what would be collected.
340    #[expect(clippy::unnecessary_wraps, reason = "Not really")]
341    fn dry_run_collect(
342        &self,
343        output_dir: &Path,
344        options: &CollectOptions,
345    ) -> Result<CollectionResult, Error> {
346        let mut skipped = Vec::new();
347        let mut collected_count = 0;
348
349        // Get dev-only packages if we need to filter them out
350        let dev_only = if options.include_dev {
351            HashSet::new()
352        } else {
353            self.get_dev_only_packages()
354        };
355
356        for pkg in &self.metadata.packages {
357            if self.metadata.workspace_members.contains(&pkg.id) {
358                continue;
359            }
360
361            // Skip dev-only dependencies if not requested
362            if dev_only.contains(&pkg.id) {
363                continue;
364            }
365
366            let version = pkg.version.to_string();
367
368            if self.find_registry_source(&pkg.name, &version).is_some() {
369                collected_count += 1;
370            } else {
371                skipped.push(format!("{}-{}", pkg.name, version));
372            }
373        }
374
375        Ok(CollectionResult {
376            output_dir: output_dir.to_path_buf(),
377            crates_collected: collected_count,
378            skipped,
379        })
380    }
381
382    /// Update .gitignore to include .source_* pattern.
383    fn update_gitignore(&self) -> Result<(), Error> {
384        let gitignore_path = self.metadata.workspace_root.join(".gitignore");
385        let pattern = ".source_*";
386
387        // Read existing content
388        let content = StdFs::read_to_string(&gitignore_path).unwrap_or_default();
389
390        // Check if pattern already exists
391        if content.lines().any(|line| line.trim() == pattern) {
392            return Ok(());
393        }
394
395        // Append pattern
396        let mut file = StdFs::OpenOptions::new()
397            .create(true)
398            .append(true)
399            .open(&gitignore_path)
400            .map_err(|e| Error::SourceCollector(format!("Failed to open .gitignore: {e}")))?;
401
402        // Add newline if file doesn't end with one
403        if !content.is_empty() && !content.ends_with('\n') {
404            writeln!(file).map_err(|e| {
405                Error::SourceCollector(format!("Failed to write to .gitignore: {e}"))
406            })?;
407        }
408
409        writeln!(file, "{pattern}")
410            .map_err(|e| Error::SourceCollector(format!("Failed to write to .gitignore: {e}")))?;
411
412        Ok(())
413    }
414
415    /// List all external dependencies.
416    #[must_use]
417    pub fn list_dependencies(&self) -> Vec<(&str, &str)> {
418        self.metadata
419            .packages
420            .iter()
421            .filter(|pkg| !self.metadata.workspace_members.contains(&pkg.id))
422            .map(|pkg| (pkg.name.as_str(), pkg.version.to_string().leak() as &str))
423            .collect()
424    }
425
426    /// Recursively copy a directory.
427    fn copy_dir_recursive(src: &Path, dest: &Path) -> Result<(), Error> {
428        StdFs::create_dir_all(dest).map_err(|e| {
429            Error::SourceCollector(format!("Failed to create dir {}: {e}", dest.display()))
430        })?;
431
432        for entry in StdFs::read_dir(src).map_err(|e| {
433            Error::SourceCollector(format!("Failed to read dir {}: {e}", src.display()))
434        })? {
435            let entry =
436                entry.map_err(|e| Error::SourceCollector(format!("Failed to read entry: {e}")))?;
437            let path = entry.path();
438            let dest_path = dest.join(entry.file_name());
439
440            if path.is_dir() {
441                Self::copy_dir_recursive(&path, &dest_path)?;
442            } else {
443                StdFs::copy(&path, &dest_path).map_err(|e| {
444                    Error::SourceCollector(format!(
445                        "Failed to copy {} to {}: {e}",
446                        path.display(),
447                        dest_path.display()
448                    ))
449                })?;
450            }
451        }
452
453        Ok(())
454    }
455}
456
/// Namespace for the small date/time helpers used when stamping manifests.
struct TimeUtils;

impl TimeUtils {
    /// Number of seconds in one (leap-second-free) day.
    const SECS_PER_DAY: u64 = 86_400;

    /// Current UTC time as an ISO 8601 timestamp (`YYYY-MM-DDTHH:MM:SSZ`),
    /// computed without an external date/time dependency.
    ///
    /// A system clock set before the Unix epoch is reported as the epoch.
    fn chrono_lite_now() -> String {
        let secs = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        Self::format_unix_secs(secs)
    }

    /// Render `secs` seconds since the Unix epoch as `YYYY-MM-DDTHH:MM:SSZ`.
    ///
    /// The conversion follows the proleptic Gregorian calendar exactly; like
    /// Unix time itself it does not model leap seconds.
    fn format_unix_secs(secs: u64) -> String {
        let time_of_day = secs % Self::SECS_PER_DAY;
        let hours = time_of_day / 3600;
        let minutes = (time_of_day % 3600) / 60;
        let seconds = time_of_day % 60;

        // Peel off whole years first, then whole months, from the day count.
        let mut remaining_days = secs / Self::SECS_PER_DAY;
        let mut year: u64 = 1970;

        loop {
            let days_in_year = if Self::is_leap_year(year) { 366 } else { 365 };

            if remaining_days < days_in_year {
                break;
            }

            remaining_days -= days_in_year;
            year += 1;
        }

        let february = if Self::is_leap_year(year) { 29 } else { 28 };
        let days_in_months = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];

        let mut month = 1;
        for days in days_in_months {
            if remaining_days < days {
                break;
            }

            remaining_days -= days;
            month += 1;
        }

        // `remaining_days` is now the zero-based day within the month.
        let day = remaining_days + 1;

        format!("{year:04}-{month:02}-{day:02}T{hours:02}:{minutes:02}:{seconds:02}Z")
    }

    /// Gregorian leap-year rule: divisible by 4, except centuries, unless
    /// the century is divisible by 400.
    const fn is_leap_year(year: u64) -> bool {
        (year.is_multiple_of(4) && !year.is_multiple_of(100)) || year.is_multiple_of(400)
    }
}
515
#[cfg(test)]
mod tests {
    use super::{SourceCollector, TimeUtils};

    /// Names of the packages the collector classifies as dev-only.
    fn dev_only_names(collector: &SourceCollector) -> Vec<&str> {
        let dev_only = collector.get_dev_only_packages();

        collector
            .metadata
            .packages
            .iter()
            .filter(|pkg| dev_only.contains(&pkg.id))
            .map(|pkg| pkg.name.as_str())
            .collect()
    }

    #[test]
    fn test_chrono_lite_now() {
        let ts = TimeUtils::chrono_lite_now();

        // Expect the ISO 8601 markers and a year in the 2000s.
        assert!(ts.contains('T'));
        assert!(ts.ends_with('Z'));
        assert!(ts.starts_with("20"));
    }

    #[test]
    fn test_is_leap_year() {
        for leap in [2000, 2024] {
            assert!(TimeUtils::is_leap_year(leap));
        }

        for common in [1900, 2023] {
            assert!(!TimeUtils::is_leap_year(common));
        }
    }

    #[test]
    fn test_get_dev_only_packages_detects_dev_deps() {
        // Runs against the metadata of the actual cargo-docs-md project.
        let collector = SourceCollector::new().expect("Failed to create collector");
        let dev_only_names = dev_only_names(&collector);

        // insta and divan are expected to be dev-only dependencies there.
        assert!(
            dev_only_names.contains(&"insta"),
            "insta should be detected as dev-only, got: {dev_only_names:?}"
        );
        assert!(
            dev_only_names.contains(&"divan"),
            "divan should be detected as dev-only, got: {dev_only_names:?}"
        );
    }

    #[test]
    fn test_get_dev_only_packages_excludes_normal_deps() {
        let collector = SourceCollector::new().expect("Failed to create collector");
        let dev_only_names = dev_only_names(&collector);

        // Normal dependencies must never be classified as dev-only.
        assert!(
            !dev_only_names.contains(&"serde"),
            "serde should NOT be dev-only"
        );
        assert!(
            !dev_only_names.contains(&"clap"),
            "clap should NOT be dev-only"
        );
        assert!(
            !dev_only_names.contains(&"syn"),
            "syn should NOT be dev-only"
        );
        // tracing is listed under both deps and dev-deps; the normal edge wins.
        assert!(
            !dev_only_names.contains(&"tracing"),
            "tracing should NOT be dev-only (it's also a normal dependency)"
        );
    }

    #[test]
    fn test_get_dev_only_packages_with_no_resolve() {
        let mut collector = SourceCollector::new().expect("Failed to create collector");

        // Simulate metadata that was produced without a resolve graph.
        collector.metadata.resolve = None;

        let dev_only = collector.get_dev_only_packages();
        assert!(
            dev_only.is_empty(),
            "Should return empty set when no resolve graph"
        );
    }

    #[test]
    fn test_list_dependencies_excludes_workspace_members() {
        let collector = SourceCollector::new().expect("Failed to create collector");
        let deps = collector.list_dependencies();
        let dep_names: Vec<&str> = deps.iter().map(|(name, _)| *name).collect();

        // The workspace member (cargo-docs-md itself) must be filtered out.
        assert!(
            !dep_names.contains(&"cargo-docs-md"),
            "Should not include workspace member"
        );

        // Real dependencies must still be listed.
        assert!(
            dep_names.contains(&"serde"),
            "Should include serde dependency"
        );
    }
}