1use crate::error::{GitHubDocsError, Result};
7use crate::types::{DocsDirectory, DocumentationFile, FilePath, RepoSpec};
8use std::path::Path;
9use std::process::Command;
10use tempfile::TempDir;
11use walkdir::WalkDir;
12
/// Settings that control what is fetched from the repository and where it is written.
#[derive(Debug, Clone)]
pub struct DownloadConfig {
    /// Directory that matching files are copied into; it is created on demand.
    pub output_dir: String,
    /// When `true`, files are only listed and summarized; nothing is copied.
    pub list_only: bool,
    /// Whether nested subdirectories should be scanned.
    /// NOTE(review): presumed from the name — verify against the downloader's directory walk.
    pub recursive: bool,
    /// Repository-relative path used as the sparse-checkout target.
    pub target_path: String,
}
25
26impl Default for DownloadConfig {
27 fn default() -> Self {
28 Self {
29 output_dir: "downloads".to_string(),
30 list_only: false,
31 recursive: true,
32 target_path: "docs".to_string(),
33 }
34 }
35}
36
/// Downloads documentation files from a GitHub repository using a sparse git checkout.
pub struct GitHubDocsDownloader {
    /// Repository to fetch from; its `owner` and `name` form the clone URL.
    repo: RepoSpec,
    /// Options controlling the target path, output directory and list-only mode.
    config: DownloadConfig,
}
46
47impl GitHubDocsDownloader {
    /// Creates a downloader for `repo` that operates according to `config`.
    #[must_use]
    pub fn new(repo: RepoSpec, config: DownloadConfig) -> Self {
        Self { repo, config }
    }
58
    /// Returns the documentation directories to scan.
    ///
    /// Delegates to the git-based lookup, which yields only the configured
    /// `target_path`.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; the `Result` is part of
    /// the signature only.
    pub fn find_docs_directories(&self) -> Result<Vec<DocsDirectory>> {
        Ok(self.find_docs_directories_git())
    }
67
68 fn find_docs_directories_git(&self) -> Vec<DocsDirectory> {
70 println!(
71 "Using sparse checkout for path: {}",
72 self.config.target_path
73 );
74 vec![DocsDirectory::new(self.config.target_path.clone())]
76 }
77
78 pub fn get_all_documentation_files(
88 &self,
89 docs_dirs: &[DocsDirectory],
90 ) -> Result<Vec<DocumentationFile>> {
91 let mut all_files = Vec::new();
92
93 for docs_dir in docs_dirs {
94 println!("Scanning {docs_dir}...");
95
96 let files = self.get_documentation_files_git(docs_dir)?;
97
98 println!("Found {} documentation files in {}", files.len(), docs_dir);
99 for file in &files {
100 println!(" - {} ({})", file.path, file.size);
101 }
102
103 all_files.extend(files);
104 }
105
106 Ok(all_files)
107 }
108
109 fn get_documentation_files_git(
111 &self,
112 docs_dir: &DocsDirectory,
113 ) -> Result<Vec<DocumentationFile>> {
114 let temp_dir = TempDir::new()?;
116 let repo_path = temp_dir.path().join(self.repo.name.as_str());
117
118 let clone_url = format!(
119 "https://github.com/{}/{}.git",
120 self.repo.owner.as_str(),
121 self.repo.name.as_str()
122 );
123
124 let output = Command::new("git")
127 .args(["clone", "--no-checkout", "--depth", "1", &clone_url])
128 .current_dir(temp_dir.path())
129 .output()?;
130
131 if !output.status.success() {
132 let stderr = String::from_utf8_lossy(&output.stderr);
133 return Err(GitHubDocsError::git_operation_failed(
134 format!("git clone --no-checkout {clone_url}"),
135 stderr,
136 ));
137 }
138
139 let output = Command::new("git")
141 .args(["config", "core.sparseCheckout", "true"])
142 .current_dir(&repo_path)
143 .output()?;
144
145 if !output.status.success() {
146 let stderr = String::from_utf8_lossy(&output.stderr);
147 return Err(GitHubDocsError::git_operation_failed(
148 "git config core.sparseCheckout true".to_string(),
149 stderr,
150 ));
151 }
152
153 let sparse_info_dir = repo_path.join(".git").join("info");
155 std::fs::create_dir_all(&sparse_info_dir)?;
156 let sparse_checkout_file = sparse_info_dir.join("sparse-checkout");
157 std::fs::write(&sparse_checkout_file, format!("{}/*\n", docs_dir.as_str()))?;
158
159 let output = Command::new("git")
161 .args(["checkout"])
162 .current_dir(&repo_path)
163 .output()?;
164
165 if !output.status.success() {
166 let stderr = String::from_utf8_lossy(&output.stderr);
167 return Err(GitHubDocsError::git_operation_failed(
168 "git checkout".to_string(),
169 stderr,
170 ));
171 }
172
173 let docs_path = repo_path.join(docs_dir.as_str());
174 if !docs_path.exists() {
175 return Ok(Vec::new());
176 }
177
178 let mut doc_files = Vec::new();
179
180 for entry in WalkDir::new(&docs_path)
181 .into_iter()
182 .filter_map(std::result::Result::ok)
183 .filter(|e| e.file_type().is_file())
184 {
185 let file_name = entry.file_name().to_string_lossy();
186 if Self::is_documentation_file(&file_name) {
187 let file_size = entry
188 .metadata()
189 .map_err(GitHubDocsError::WalkDirError)?
190 .len();
191
192 if !self.config.list_only {
194 let dest_path = Path::new(&self.config.output_dir).join(entry.file_name());
196
197 std::fs::create_dir_all(&self.config.output_dir)?;
199 std::fs::copy(entry.path(), &dest_path)?;
200 }
201
202 let flattened_path = Path::new(file_name.as_ref());
205 doc_files.push(DocumentationFile {
206 name: file_name.to_string().into(),
207 path: FilePath::new(flattened_path.to_path_buf()),
208 download_url: crate::types::DownloadUrl::parse("file://downloaded")?,
209 size: file_size.into(),
210 docs_directory: docs_dir.clone(),
211 });
212 }
213 }
214
215 Ok(doc_files)
216 }
217
218 pub fn download_files(&self, files: &[DocumentationFile]) -> Result<()> {
228 if self.config.list_only {
229 Self::print_file_summary(files);
230 return Ok(());
231 }
232
233 println!("\nDownload complete!");
235 println!(
236 " Downloaded {} files to {}",
237 files.len(),
238 self.config.output_dir
239 );
240
241 Self::print_file_summary(files);
242 Ok(())
243 }
244
245 fn print_file_summary(files: &[DocumentationFile]) {
247 println!("\nTotal documentation files found: {}", files.len());
248
249 let total_size: u64 = files.iter().map(|f| f.size.bytes()).sum();
250 println!("Total size: {total_size} bytes");
251
252 let mut dirs_summary = std::collections::HashMap::new();
254 for file in files {
255 let entry = dirs_summary
256 .entry(file.docs_directory.as_str())
257 .or_insert((0, 0u64));
258 entry.0 += 1;
259 entry.1 += file.size.bytes();
260 }
261
262 println!("\nFiles by directory:");
263 for (dir, (count, size)) in dirs_summary {
264 println!(" {dir}: {count} files ({size} bytes)");
265 }
266 }
267
268 fn is_documentation_file(filename: &str) -> bool {
270 let filename_lower = filename.to_lowercase();
271
272 let doc_extensions = [
274 ".md",
275 ".mdx",
276 ".markdown",
277 ".txt",
278 ".rst",
279 ".adoc",
280 ".asciidoc",
281 ".org",
282 ".tex",
283 ".pdf",
284 ".html",
285 ".htm",
286 ".xml",
287 ];
288
289 if doc_extensions
290 .iter()
291 .any(|ext| filename_lower.ends_with(ext))
292 {
293 return true;
294 }
295
296 let doc_names = [
298 "readme",
299 "changelog",
300 "changes",
301 "news",
302 "history",
303 "license",
304 "copying",
305 "authors",
306 "contributors",
307 "todo",
308 "install",
309 "installation",
310 "usage",
311 "guide",
312 "tutorial",
313 "faq",
314 "api",
315 "reference",
316 "manual",
317 "docs",
318 "documentation",
319 ];
320
321 doc_names.iter().any(|name| {
322 filename_lower == *name
323 || filename_lower.starts_with(&format!("{name}."))
324 || filename_lower.starts_with(&format!("{name}_"))
325 || filename_lower.starts_with(&format!("{name}-"))
326 })
327 }
328
    /// Returns the repository this downloader was created for.
    #[must_use]
    pub fn repo(&self) -> &RepoSpec {
        &self.repo
    }
334}