gh_docs_download/
downloader.rs

1//! File downloader using git clone approach.
2//!
3//! This module provides the core downloading functionality using git clone
4//! to access repository contents locally.
5
6use crate::error::{GitHubDocsError, Result};
7use crate::types::{DocsDirectory, DocumentationFile, FilePath, RepoSpec};
8use std::path::Path;
9use std::process::Command;
10use tempfile::TempDir;
11use walkdir::WalkDir;
12
/// Settings that control what the downloader fetches and where it writes it.
#[derive(Debug, Clone)]
pub struct DownloadConfig {
    /// Directory that matching files are copied into.
    pub output_dir: String,
    /// When `true`, files are reported but not copied into `output_dir`.
    pub list_only: bool,
    /// Whether subdirectories should be included recursively.
    pub recursive: bool,
    /// Path inside the repository whose contents should be fetched.
    pub target_path: String,
}

impl Default for DownloadConfig {
    /// Builds the stock configuration: copy files into `downloads`, include
    /// subdirectories, and read from the repository's `docs` path.
    fn default() -> Self {
        Self {
            target_path: String::from("docs"),
            output_dir: String::from("downloads"),
            recursive: true,
            list_only: false,
        }
    }
}
36
37/// Documentation downloader using git clone approach.
38///
39/// This downloader fetches documentation files by cloning the repository with git.
40/// It automatically discovers documentation directories and filters files based on
41/// common documentation patterns.
42pub struct GitHubDocsDownloader {
43    repo: RepoSpec,
44    config: DownloadConfig,
45}
46
47impl GitHubDocsDownloader {
48    /// Create a new documentation downloader.
49    ///
50    /// # Arguments
51    ///
52    /// * `repo` - Repository specification (owner/name)
53    /// * `config` - Download configuration
54    #[must_use]
55    pub fn new(repo: RepoSpec, config: DownloadConfig) -> Self {
56        Self { repo, config }
57    }
58
59    /// Discover all documentation directories in the repository.
60    ///
61    /// # Errors
62    ///
63    /// Returns `GitHubDocsError` if directory discovery fails.
64    pub fn find_docs_directories(&self) -> Result<Vec<DocsDirectory>> {
65        Ok(self.find_docs_directories_git())
66    }
67
68    /// Find documentation directories using git clone approach.
69    fn find_docs_directories_git(&self) -> Vec<DocsDirectory> {
70        println!(
71            "Using sparse checkout for path: {}",
72            self.config.target_path
73        );
74        // Return the target path directly since we always have one from the tree URL
75        vec![DocsDirectory::new(self.config.target_path.clone())]
76    }
77
78    /// Get all documentation files from the specified directories.
79    ///
80    /// # Arguments
81    ///
82    /// * `docs_dirs` - Directories to scan for documentation files
83    ///
84    /// # Errors
85    ///
86    /// Returns `GitHubDocsError` if file discovery fails.
87    pub fn get_all_documentation_files(
88        &self,
89        docs_dirs: &[DocsDirectory],
90    ) -> Result<Vec<DocumentationFile>> {
91        let mut all_files = Vec::new();
92
93        for docs_dir in docs_dirs {
94            println!("Scanning {docs_dir}...");
95
96            let files = self.get_documentation_files_git(docs_dir)?;
97
98            println!("Found {} documentation files in {}", files.len(), docs_dir);
99            for file in &files {
100                println!("  - {} ({})", file.path, file.size);
101            }
102
103            all_files.extend(files);
104        }
105
106        Ok(all_files)
107    }
108
109    /// Get documentation files from a directory using git clone approach.
110    fn get_documentation_files_git(
111        &self,
112        docs_dir: &DocsDirectory,
113    ) -> Result<Vec<DocumentationFile>> {
114        // Create temporary directory and clone
115        let temp_dir = TempDir::new()?;
116        let repo_path = temp_dir.path().join(self.repo.name.as_str());
117
118        let clone_url = format!(
119            "https://github.com/{}/{}.git",
120            self.repo.owner.as_str(),
121            self.repo.name.as_str()
122        );
123
124        // Use sparse checkout for the specific documentation path
125        // Clone with no checkout
126        let output = Command::new("git")
127            .args(["clone", "--no-checkout", "--depth", "1", &clone_url])
128            .current_dir(temp_dir.path())
129            .output()?;
130
131        if !output.status.success() {
132            let stderr = String::from_utf8_lossy(&output.stderr);
133            return Err(GitHubDocsError::git_operation_failed(
134                format!("git clone --no-checkout {clone_url}"),
135                stderr,
136            ));
137        }
138
139        // Enable sparse checkout
140        let output = Command::new("git")
141            .args(["config", "core.sparseCheckout", "true"])
142            .current_dir(&repo_path)
143            .output()?;
144
145        if !output.status.success() {
146            let stderr = String::from_utf8_lossy(&output.stderr);
147            return Err(GitHubDocsError::git_operation_failed(
148                "git config core.sparseCheckout true".to_string(),
149                stderr,
150            ));
151        }
152
153        // Set sparse checkout paths
154        let sparse_info_dir = repo_path.join(".git").join("info");
155        std::fs::create_dir_all(&sparse_info_dir)?;
156        let sparse_checkout_file = sparse_info_dir.join("sparse-checkout");
157        std::fs::write(&sparse_checkout_file, format!("{}/*\n", docs_dir.as_str()))?;
158
159        // Checkout the specified paths
160        let output = Command::new("git")
161            .args(["checkout"])
162            .current_dir(&repo_path)
163            .output()?;
164
165        if !output.status.success() {
166            let stderr = String::from_utf8_lossy(&output.stderr);
167            return Err(GitHubDocsError::git_operation_failed(
168                "git checkout".to_string(),
169                stderr,
170            ));
171        }
172
173        let docs_path = repo_path.join(docs_dir.as_str());
174        if !docs_path.exists() {
175            return Ok(Vec::new());
176        }
177
178        let mut doc_files = Vec::new();
179
180        for entry in WalkDir::new(&docs_path)
181            .into_iter()
182            .filter_map(std::result::Result::ok)
183            .filter(|e| e.file_type().is_file())
184        {
185            let file_name = entry.file_name().to_string_lossy();
186            if Self::is_documentation_file(&file_name) {
187                let file_size = entry
188                    .metadata()
189                    .map_err(GitHubDocsError::WalkDirError)?
190                    .len();
191
192                // Copy file immediately while temp directory exists
193                if !self.config.list_only {
194                    // Flatten structure: use only the filename, not the full path
195                    let dest_path = Path::new(&self.config.output_dir).join(entry.file_name());
196
197                    // Create output directory if it doesn't exist
198                    std::fs::create_dir_all(&self.config.output_dir)?;
199                    std::fs::copy(entry.path(), &dest_path)?;
200                }
201
202                // Create documentation file record (URL not needed for git approach)
203                // For flattened structure, use just the filename as the path
204                let flattened_path = Path::new(file_name.as_ref());
205                doc_files.push(DocumentationFile {
206                    name: file_name.to_string().into(),
207                    path: FilePath::new(flattened_path.to_path_buf()),
208                    download_url: crate::types::DownloadUrl::parse("file://downloaded")?,
209                    size: file_size.into(),
210                    docs_directory: docs_dir.clone(),
211                });
212            }
213        }
214
215        Ok(doc_files)
216    }
217
218    /// Show download summary for files.
219    ///
220    /// # Arguments
221    ///
222    /// * `files` - Documentation files that were processed
223    ///
224    /// # Errors
225    ///
226    /// This function does not return errors in the current implementation.
227    pub fn download_files(&self, files: &[DocumentationFile]) -> Result<()> {
228        if self.config.list_only {
229            Self::print_file_summary(files);
230            return Ok(());
231        }
232
233        // Files are already downloaded during get_documentation_files_git
234        println!("\nDownload complete!");
235        println!(
236            "  Downloaded {} files to {}",
237            files.len(),
238            self.config.output_dir
239        );
240
241        Self::print_file_summary(files);
242        Ok(())
243    }
244
245    /// Print a summary of discovered files.
246    fn print_file_summary(files: &[DocumentationFile]) {
247        println!("\nTotal documentation files found: {}", files.len());
248
249        let total_size: u64 = files.iter().map(|f| f.size.bytes()).sum();
250        println!("Total size: {total_size} bytes");
251
252        // Group files by directory
253        let mut dirs_summary = std::collections::HashMap::new();
254        for file in files {
255            let entry = dirs_summary
256                .entry(file.docs_directory.as_str())
257                .or_insert((0, 0u64));
258            entry.0 += 1;
259            entry.1 += file.size.bytes();
260        }
261
262        println!("\nFiles by directory:");
263        for (dir, (count, size)) in dirs_summary {
264            println!("  {dir}: {count} files ({size} bytes)");
265        }
266    }
267
268    /// Check if a file appears to be documentation based on its name and extension.
269    fn is_documentation_file(filename: &str) -> bool {
270        let filename_lower = filename.to_lowercase();
271
272        // Check file extensions
273        let doc_extensions = [
274            ".md",
275            ".mdx",
276            ".markdown",
277            ".txt",
278            ".rst",
279            ".adoc",
280            ".asciidoc",
281            ".org",
282            ".tex",
283            ".pdf",
284            ".html",
285            ".htm",
286            ".xml",
287        ];
288
289        if doc_extensions
290            .iter()
291            .any(|ext| filename_lower.ends_with(ext))
292        {
293            return true;
294        }
295
296        // Check common documentation filenames
297        let doc_names = [
298            "readme",
299            "changelog",
300            "changes",
301            "news",
302            "history",
303            "license",
304            "copying",
305            "authors",
306            "contributors",
307            "todo",
308            "install",
309            "installation",
310            "usage",
311            "guide",
312            "tutorial",
313            "faq",
314            "api",
315            "reference",
316            "manual",
317            "docs",
318            "documentation",
319        ];
320
321        doc_names.iter().any(|name| {
322            filename_lower == *name
323                || filename_lower.starts_with(&format!("{name}."))
324                || filename_lower.starts_with(&format!("{name}_"))
325                || filename_lower.starts_with(&format!("{name}-"))
326        })
327    }
328
329    /// Get the repository specification.
330    #[must_use]
331    pub fn repo(&self) -> &RepoSpec {
332        &self.repo
333    }
334}