1use crate::core::{
2 analysis::{FileMetrics, ProjectAnalysis},
3 error::{AnalysisError, Result},
4 filter::{FilterStats, IntelligentFilter},
5 registry::LanguageRegistry,
6};
7use flate2::read::GzDecoder;
8use futures_util::StreamExt;
9use reqwest::Client;
10use serde::Deserialize;
11use std::io::{Cursor, Read};
12use tar::Archive;
13
14#[cfg(not(target_arch = "wasm32"))]
15use tokio::task;
16
/// Value passed to the HTTP client builder as the `User-Agent` for every request.
static USER_AGENT: &str = "bytes-radar/1.0.0";
18
/// The part of the GitHub repository API response that this module reads.
#[derive(Deserialize)]
struct GitHubRepoInfo {
    /// Name of the repository's default branch as reported by the API.
    default_branch: String,
}
23
24#[cfg(feature = "cli")]
25use indicatif::ProgressBar;
26
/// Downloads remote `tar.gz` archives over HTTP and analyzes the files inside.
pub struct RemoteAnalyzer {
    /// HTTP client; rebuilt whenever the token, timeout or TLS setting changes.
    client: Client,
    /// Optional token sent as `Authorization: token <value>` with each request.
    github_token: Option<String>,
    /// Request timeout in seconds (only applied on non-`wasm32` targets).
    timeout: u64,
    /// When `true`, invalid TLS certificates are accepted (non-`wasm32` only).
    allow_insecure: bool,
    /// Decides which archive entries are analyzed and which are skipped.
    filter: IntelligentFilter,
    /// Progress bar updated during download (only with the `cli` feature).
    #[cfg(feature = "cli")]
    progress_bar: Option<ProgressBar>,
}
36
37impl RemoteAnalyzer {
38 pub fn new() -> Self {
39 let mut builder = Client::builder().user_agent(USER_AGENT);
40
41 #[cfg(not(target_arch = "wasm32"))]
42 {
43 builder = builder.timeout(std::time::Duration::from_secs(300));
44 }
45
46 let client = builder.build().expect("Failed to create HTTP client");
47
48 Self {
49 client,
50 github_token: None,
51 timeout: 300,
52 allow_insecure: false,
53 filter: IntelligentFilter::default(),
54 #[cfg(feature = "cli")]
55 progress_bar: None,
56 }
57 }
58
    /// Installs (or removes, with `None`) the progress bar that is updated
    /// while an archive is downloaded. Only available with the `cli` feature.
    #[cfg(feature = "cli")]
    pub fn set_progress_bar(&mut self, progress_bar: Option<ProgressBar>) {
        self.progress_bar = progress_bar;
    }
63
64 pub fn set_github_token(&mut self, token: &str) {
65 self.github_token = Some(token.to_string());
66 self.rebuild_client();
67 }
68
    /// Sets the request timeout in seconds and rebuilds the HTTP client.
    /// The timeout is only applied to the client on non-`wasm32` targets.
    pub fn set_timeout(&mut self, timeout: u64) {
        self.timeout = timeout;
        self.rebuild_client();
    }
73
    /// Enables or disables acceptance of invalid TLS certificates and rebuilds
    /// the HTTP client. The setting only takes effect on non-`wasm32` targets.
    pub fn set_allow_insecure(&mut self, allow_insecure: bool) {
        self.allow_insecure = allow_insecure;
        self.rebuild_client();
    }
78
    /// Replaces the filter that decides which archive entries are analyzed.
    pub fn set_filter(&mut self, filter: IntelligentFilter) {
        self.filter = filter;
    }
82
83 pub fn set_aggressive_filtering(&mut self, enabled: bool) {
84 if enabled {
85 self.filter = IntelligentFilter::aggressive();
86 } else {
87 self.filter = IntelligentFilter::default();
88 }
89 }
90
91 fn rebuild_client(&mut self) {
92 let mut builder = Client::builder().user_agent(USER_AGENT);
93
94 #[cfg(not(target_arch = "wasm32"))]
95 {
96 builder = builder.timeout(std::time::Duration::from_secs(self.timeout));
97 }
98
99 #[cfg(not(target_arch = "wasm32"))]
100 if self.allow_insecure {
101 builder = builder.danger_accept_invalid_certs(true);
102 }
103
104 if let Some(token) = &self.github_token {
105 let mut headers = reqwest::header::HeaderMap::new();
106 let auth_value = format!("token {}", token);
107 headers.insert(
108 reqwest::header::AUTHORIZATION,
109 auth_value.parse().expect("Invalid token format"),
110 );
111 builder = builder.default_headers(headers);
112 }
113
114 self.client = builder.build().expect("Failed to create HTTP client");
115 }
116
117 pub async fn analyze_url(&self, url: &str) -> Result<ProjectAnalysis> {
118 let download_urls = self.resolve_git_url(url).await?;
119
120 let mut url_errors: Vec<crate::core::error::DownloadUrlError> = Vec::new();
121 let mut total_attempts = 0u32;
122 for download_url in download_urls {
123 total_attempts += 1;
124 match self.analyze_tarball_with_name(&download_url, url).await {
125 Ok(analysis) => return Ok(analysis),
126 Err(e) => {
127 #[cfg(feature = "cli")]
128 log::debug!("Failed to download from {}: {}", download_url, e);
129
130 let error_info = crate::core::error::DownloadUrlError {
131 url: download_url.clone(),
132 error_message: format!("{}", e),
133 error_type: match e {
134 AnalysisError::NetworkError { .. } => "NetworkError".to_string(),
135 AnalysisError::ArchiveError { .. } => "ArchiveError".to_string(),
136 _ => "UnknownError".to_string(),
137 },
138 http_status_code: self.extract_http_status_code(&e),
139 retry_count: 1,
140 };
141
142 url_errors.push(error_info);
143 continue;
144 }
145 }
146 }
147
148 Err(AnalysisError::network(
149 "All download URLs failed".to_string(),
150 ))
151 }
152
153 async fn resolve_git_url(&self, url: &str) -> Result<Vec<String>> {
154 if url.ends_with(".tar.gz") || url.ends_with(".tgz") {
155 return Ok(vec![url.to_string()]);
156 }
157
158 let expanded_url = self.expand_url(url);
159
160 if expanded_url.starts_with("http://") || expanded_url.starts_with("https://") {
161 if !expanded_url.contains("github.com")
162 && !expanded_url.contains("gitlab.com")
163 && !expanded_url.contains("gitlab.")
164 && !expanded_url.contains("bitbucket.org")
165 && !expanded_url.contains("codeberg.org")
166 {
167 if expanded_url.ends_with(".tar.gz") || expanded_url.ends_with(".tgz") {
168 return Ok(vec![expanded_url.to_string()]);
169 } else {
170 return Ok(vec![expanded_url.to_string()]);
171 }
172 }
173 }
174
175 let mut download_urls = Vec::new();
176
177 if let Some(github_url) = self.parse_github_url_with_branch(&expanded_url) {
178 download_urls.push(github_url);
179 }
180
181 if let Some(gitlab_url) = self.parse_gitlab_url_with_branch(&expanded_url) {
182 download_urls.push(gitlab_url);
183 }
184
185 if let Some(bitbucket_url) = self.parse_bitbucket_url_with_branch(&expanded_url) {
186 download_urls.push(bitbucket_url);
187 }
188
189 if let Some(codeberg_url) = self.parse_codeberg_url_with_branch(&expanded_url) {
190 download_urls.push(codeberg_url);
191 }
192
193 if download_urls.is_empty() {
194 let mut branches = vec![
195 "main".to_string(),
196 "master".to_string(),
197 "develop".to_string(),
198 "dev".to_string(),
199 ];
200
201 #[cfg(not(target_arch = "wasm32"))]
202 if expanded_url.contains("github.com") {
203 if let Some(default_branch) = self.get_github_default_branch(&expanded_url).await {
204 branches.insert(0, default_branch);
205 branches.dedup();
206 }
207 }
208
209 #[cfg(target_arch = "wasm32")]
210 if expanded_url.contains("github.com") {
211 branches = vec![
212 "main".to_string(),
213 "master".to_string(),
214 "develop".to_string(),
215 "dev".to_string(),
216 ];
217 }
218
219 for branch in &branches {
220 if let Some(github_url) = self.parse_github_url(&expanded_url, branch) {
221 download_urls.push(github_url);
222 }
223
224 if let Some(gitlab_url) = self.parse_gitlab_url(&expanded_url, branch) {
225 download_urls.push(gitlab_url);
226 }
227
228 if let Some(bitbucket_url) = self.parse_bitbucket_url(&expanded_url, branch) {
229 download_urls.push(bitbucket_url);
230 }
231
232 if let Some(codeberg_url) = self.parse_codeberg_url(&expanded_url, branch) {
233 download_urls.push(codeberg_url);
234 }
235 }
236 }
237
238 if download_urls.is_empty() {
239 return Err(AnalysisError::url_parsing(format!(
240 "Unsupported URL format or no accessible branch found: {}. Please provide a direct tar.gz URL or a supported repository URL.",
241 expanded_url
242 )));
243 }
244
245 Ok(download_urls)
246 }
247
248 fn parse_github_url_with_branch(&self, url: &str) -> Option<String> {
249 if url.contains("github.com") {
250 if url.contains("/tree/") {
251 let parts: Vec<&str> = url.split('/').collect();
252 if let Some(tree_pos) = parts.iter().position(|&x| x == "tree") {
253 if tree_pos + 1 < parts.len() && tree_pos >= 2 {
254 let owner = parts[tree_pos - 2];
255 let repo = parts[tree_pos - 1];
256 let branch = parts[tree_pos + 1];
257 return Some(format!(
258 "https://github.com/{}/{}/archive/refs/heads/{}.tar.gz",
259 owner, repo, branch
260 ));
261 }
262 }
263 }
264
265 if url.contains("/commit/") {
266 return self.extract_github_commit_url(url);
267 }
268 }
269 None
270 }
271
272 fn parse_gitlab_url_with_branch(&self, url: &str) -> Option<String> {
273 if url.contains("gitlab.com") || url.contains("gitlab.") {
274 if url.contains("/-/tree/") {
275 let parts: Vec<&str> = url.split('/').collect();
276 if let Some(tree_pos) = parts.iter().position(|&x| x == "tree") {
277 if tree_pos + 1 < parts.len() && tree_pos >= 3 {
278 let gitlab_pos = parts.iter().position(|&x| x.contains("gitlab")).unwrap();
279 let host = parts[gitlab_pos];
280 let owner = parts[gitlab_pos + 1];
281 let repo = parts[gitlab_pos + 2];
282 let branch = parts[tree_pos + 1];
283 return Some(format!(
284 "https://{}/{}{}/-/archive/{}/{}-{}.tar.gz",
285 host,
286 owner,
287 if parts.len() > gitlab_pos + 3 && parts[gitlab_pos + 3] != "-" {
288 format!("/{}", parts[gitlab_pos + 3..tree_pos - 1].join("/"))
289 } else {
290 String::new()
291 },
292 branch,
293 repo,
294 branch
295 ));
296 }
297 }
298 }
299 }
300 None
301 }
302
303 fn parse_bitbucket_url_with_branch(&self, url: &str) -> Option<String> {
304 if url.contains("bitbucket.org") {
305 if url.contains("/commits/") {
306 let parts: Vec<&str> = url.split('/').collect();
307 if let Some(commits_pos) = parts.iter().position(|&x| x == "commits") {
308 if commits_pos + 1 < parts.len() && commits_pos >= 2 {
309 let owner = parts[commits_pos - 2];
310 let repo = parts[commits_pos - 1];
311 let commit = parts[commits_pos + 1];
312 return Some(format!(
313 "https://bitbucket.org/{}/{}/get/{}.tar.gz",
314 owner, repo, commit
315 ));
316 }
317 }
318 }
319
320 if url.contains("/branch/") {
321 let parts: Vec<&str> = url.split('/').collect();
322 if let Some(branch_pos) = parts.iter().position(|&x| x == "branch") {
323 if branch_pos + 1 < parts.len() && branch_pos >= 2 {
324 let owner = parts[branch_pos - 2];
325 let repo = parts[branch_pos - 1];
326 let branch = parts[branch_pos + 1];
327 return Some(format!(
328 "https://bitbucket.org/{}/{}/get/{}.tar.gz",
329 owner, repo, branch
330 ));
331 }
332 }
333 }
334 }
335 None
336 }
337
338 fn parse_codeberg_url_with_branch(&self, url: &str) -> Option<String> {
339 if url.contains("codeberg.org") {
340 if url.contains("/commit/") {
341 let parts: Vec<&str> = url.split('/').collect();
342 if let Some(commit_pos) = parts.iter().position(|&x| x == "commit") {
343 if commit_pos + 1 < parts.len() && commit_pos >= 2 {
344 let owner = parts[commit_pos - 2];
345 let repo = parts[commit_pos - 1];
346 let commit = parts[commit_pos + 1];
347 return Some(format!(
348 "https://codeberg.org/{}/{}/archive/{}.tar.gz",
349 owner, repo, commit
350 ));
351 }
352 }
353 }
354
355 if url.contains("/src/branch/") {
356 let parts: Vec<&str> = url.split('/').collect();
357 if let Some(branch_pos) = parts.iter().position(|&x| x == "branch") {
358 if branch_pos + 1 < parts.len() && branch_pos >= 3 {
359 let owner = parts[branch_pos - 3];
360 let repo = parts[branch_pos - 2];
361 let branch = parts[branch_pos + 1];
362 return Some(format!(
363 "https://codeberg.org/{}/{}/archive/{}.tar.gz",
364 owner, repo, branch
365 ));
366 }
367 }
368 }
369 }
370 None
371 }
372
373 fn parse_bitbucket_url(&self, url: &str, branch: &str) -> Option<String> {
374 if url.contains("bitbucket.org") {
375 let parts: Vec<&str> = url.split('/').collect();
376 if let Some(bitbucket_pos) = parts.iter().position(|&x| x == "bitbucket.org") {
377 if bitbucket_pos + 2 < parts.len() {
378 let owner = parts[bitbucket_pos + 1];
379 let repo = parts[bitbucket_pos + 2];
380 return Some(format!(
381 "https://bitbucket.org/{}/{}/get/{}.tar.gz",
382 owner, repo, branch
383 ));
384 }
385 }
386 }
387 None
388 }
389
390 fn parse_codeberg_url(&self, url: &str, branch: &str) -> Option<String> {
391 if url.contains("codeberg.org") {
392 let parts: Vec<&str> = url.split('/').collect();
393 if let Some(codeberg_pos) = parts.iter().position(|&x| x == "codeberg.org") {
394 if codeberg_pos + 2 < parts.len() {
395 let owner = parts[codeberg_pos + 1];
396 let repo = parts[codeberg_pos + 2];
397 return Some(format!(
398 "https://codeberg.org/{}/{}/archive/{}.tar.gz",
399 owner, repo, branch
400 ));
401 }
402 }
403 }
404 None
405 }
406
407 async fn check_url_exists(&self, url: &str) -> bool {
408 if let Ok(response) = self.client.head(url).send().await {
409 response.status().is_success()
410 } else {
411 false
412 }
413 }
414
415 fn extract_http_status_code(&self, error: &AnalysisError) -> Option<u16> {
416 match error {
417 AnalysisError::NetworkError { message } => {
418 if message.contains("HTTP request failed with status: ") {
419 if let Some(start) = message.find("HTTP request failed with status: ") {
420 let status_start = start + "HTTP request failed with status: ".len();
421 let status_str = &message[status_start..];
422 if let Some(end) = status_str.find(' ') {
423 status_str[..end].parse().ok()
424 } else {
425 status_str.parse().ok()
426 }
427 } else {
428 None
429 }
430 } else {
431 None
432 }
433 }
434 _ => None,
435 }
436 }
437
438 fn expand_url(&self, url: &str) -> String {
439 if url.starts_with("http://") || url.starts_with("https://") {
440 return url.to_string();
441 }
442
443 if url.contains('/') && !url.starts_with("http://") && !url.starts_with("https://") {
444 let parts: Vec<&str> = url.split('@').collect();
445 let repo_part = parts[0];
446 let branch_or_commit = parts.get(1);
447
448 let path_parts: Vec<&str> = repo_part.split('/').collect();
449 if path_parts.len() == 2 {
450 if let Some(branch) = branch_or_commit {
451 if branch.len() >= 7 && branch.chars().all(|c| c.is_ascii_hexdigit()) {
452 return format!("https://github.com/{}/commit/{}", repo_part, branch);
453 } else {
454 return format!("https://github.com/{}/tree/{}", repo_part, branch);
455 }
456 } else {
457 return format!("https://github.com/{}", repo_part);
458 }
459 }
460 }
461
462 url.to_string()
463 }
464
465 #[cfg(not(target_arch = "wasm32"))]
466 async fn get_github_default_branch(&self, url: &str) -> Option<String> {
467 let (owner, repo) = self.extract_github_owner_repo(url)?;
468
469 let api_url = format!("https://api.github.com/repos/{}/{}", owner, repo);
470
471 match self.client.get(&api_url).send().await {
472 Ok(response) => {
473 if response.status().is_success() {
474 match response.json::<GitHubRepoInfo>().await {
475 Ok(repo_info) => {
476 #[cfg(feature = "cli")]
477 log::debug!(
478 "GitHub API: Found default branch '{}' for {}/{}",
479 repo_info.default_branch,
480 owner,
481 repo
482 );
483 Some(repo_info.default_branch)
484 }
485 Err(_) => {
486 #[cfg(feature = "cli")]
487 log::debug!(
488 "GitHub API: Failed to parse response for {}/{}",
489 owner,
490 repo
491 );
492 None
493 }
494 }
495 } else {
496 #[cfg(feature = "cli")]
497 log::debug!(
498 "GitHub API: Request failed with status {} for {}/{}",
499 response.status(),
500 owner,
501 repo
502 );
503 None
504 }
505 }
506 Err(_) => {
507 #[cfg(feature = "cli")]
508 log::debug!("GitHub API: Network error for {}/{}", owner, repo);
509 None
510 }
511 }
512 }
513
514 fn extract_github_owner_repo(&self, url: &str) -> Option<(String, String)> {
515 let url = url.trim_end_matches('/');
516
517 if let Some(github_url) = url.strip_prefix("https://github.com/") {
518 let parts: Vec<&str> = github_url.split('/').collect();
519 if parts.len() >= 2 {
520 return Some((parts[0].to_string(), parts[1].to_string()));
521 }
522 }
523
524 if url.contains("github.com") {
525 let parts: Vec<&str> = url.split('/').collect();
526 if let Some(github_pos) = parts.iter().position(|&x| x == "github.com") {
527 if github_pos + 2 < parts.len() {
528 return Some((
529 parts[github_pos + 1].to_string(),
530 parts[github_pos + 2].to_string(),
531 ));
532 }
533 }
534 }
535
536 let parts: Vec<&str> = url.split('@').collect();
537 let repo_part = parts[0];
538 let path_parts: Vec<&str> = repo_part.split('/').collect();
539 if path_parts.len() == 2 {
540 return Some((path_parts[0].to_string(), path_parts[1].to_string()));
541 }
542
543 None
544 }
545
546 fn parse_github_url(&self, url: &str, branch: &str) -> Option<String> {
547 let url = url.trim_end_matches('/');
548
549 if url.contains("github.com") {
550 if let Some(commit_url) = self.extract_github_commit_url(url) {
551 return Some(commit_url);
552 }
553
554 if let Some(repo_url) = self.extract_github_repo_url(url, branch) {
555 return Some(repo_url);
556 }
557 }
558
559 None
560 }
561
562 fn extract_github_commit_url(&self, url: &str) -> Option<String> {
563 if url.contains("/commit/") {
564 let parts: Vec<&str> = url.split('/').collect();
565 if let Some(commit_pos) = parts.iter().position(|&x| x == "commit") {
566 if commit_pos + 1 < parts.len() {
567 let owner = parts.get(parts.len() - 4)?;
568 let repo = parts.get(parts.len() - 3)?;
569 let commit = parts.get(commit_pos + 1)?;
570 return Some(format!(
571 "https://github.com/{}/{}/archive/{}.tar.gz",
572 owner, repo, commit
573 ));
574 }
575 }
576 }
577 None
578 }
579
580 fn extract_github_repo_url(&self, url: &str, branch: &str) -> Option<String> {
581 let parts: Vec<&str> = url.split('/').collect();
582 if parts.len() >= 2 && parts.contains(&"github.com") {
583 if let Some(github_pos) = parts.iter().position(|&x| x == "github.com") {
584 if github_pos + 2 < parts.len() {
585 let owner = parts[github_pos + 1];
586 let repo = parts[github_pos + 2];
587 return Some(format!(
588 "https://github.com/{}/{}/archive/refs/heads/{}.tar.gz",
589 owner, repo, branch
590 ));
591 }
592 }
593 }
594 None
595 }
596
597 fn parse_gitlab_url(&self, url: &str, branch: &str) -> Option<String> {
598 let url = url.trim_end_matches('/');
599
600 if url.contains("gitlab.com") || url.contains("gitlab.") {
601 let parts: Vec<&str> = url.split('/').collect();
602 if let Some(gitlab_pos) = parts.iter().position(|&x| x.contains("gitlab")) {
603 if gitlab_pos + 2 < parts.len() {
604 let host = parts[gitlab_pos];
605 let owner = parts[gitlab_pos + 1];
606 let repo = parts[gitlab_pos + 2];
607 return Some(format!(
608 "https://{}/{}{}/-/archive/{}/{}-{}.tar.gz",
609 host,
610 owner,
611 if parts.len() > gitlab_pos + 3 {
612 format!("/{}", parts[gitlab_pos + 3..].join("/"))
613 } else {
614 String::new()
615 },
616 branch,
617 repo,
618 branch
619 ));
620 }
621 }
622 }
623
624 None
625 }
626
627 async fn analyze_tarball_with_name(
628 &self,
629 download_url: &str,
630 original_url: &str,
631 ) -> Result<ProjectAnalysis> {
632 let project_name = self.extract_project_name_from_original(original_url);
633 let mut project_analysis = ProjectAnalysis::new(project_name);
634
635 let response = self
636 .client
637 .get(download_url)
638 .send()
639 .await
640 .map_err(|e| AnalysisError::network(format!("Failed to fetch URL: {}", e)))?;
641
642 if !response.status().is_success() {
643 return Err(AnalysisError::network(format!(
644 "HTTP request failed with status: {}",
645 response.status()
646 )));
647 }
648
649 let total_size = response.content_length();
650
651 #[cfg(feature = "cli")]
652 if let Some(pb) = &self.progress_bar {
653 if let Some(size) = total_size {
654 use indicatif::ProgressStyle;
655 pb.set_style(
656 ProgressStyle::default_bar()
657 .template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {decimal_bytes_per_sec} {binary_bytes}/{binary_total_bytes} ({eta}) {msg}")
658 .unwrap_or_else(|_| ProgressStyle::default_bar())
659 .progress_chars("#>-"),
660 );
661 pb.set_length(size);
662 pb.set_message("Downloading and processing...");
663 } else {
664 pb.set_message("Downloading and processing...");
665 pb.enable_steady_tick(std::time::Duration::from_millis(120));
666 }
667 }
668
669 #[cfg(target_arch = "wasm32")]
670 {
671 let bytes = response
672 .bytes()
673 .await
674 .map_err(|e| AnalysisError::network(format!("Failed to read response: {}", e)))?;
675
676 #[cfg(feature = "cli")]
677 if let Some(pb) = &self.progress_bar {
678 pb.set_message("Processing archive...");
679 }
680
681 self.process_tarball_bytes(&bytes, &mut project_analysis)
682 .await?;
683 }
684
685 #[cfg(not(target_arch = "wasm32"))]
686 {
687 let stream = response.bytes_stream();
688 let stream_reader = StreamReader::new(
689 stream,
690 #[cfg(feature = "cli")]
691 self.progress_bar.clone(),
692 total_size,
693 );
694
695 #[cfg(feature = "cli")]
696 if let Some(pb) = &self.progress_bar {
697 pb.set_message("Processing archive...");
698 }
699
700 self.process_tarball_stream(stream_reader, &mut project_analysis)
701 .await?;
702 }
703
704 Ok(project_analysis)
705 }
706
707 async fn process_tarball_stream(
708 &self,
709 stream_reader: StreamReader,
710 project_analysis: &mut ProjectAnalysis,
711 ) -> Result<()> {
712 #[cfg(not(target_arch = "wasm32"))]
713 {
714 let filter = self.filter.clone();
715 let metrics_result = task::spawn_blocking(move || {
716 let decoder = GzDecoder::new(stream_reader);
717 let mut archive = Archive::new(decoder);
718
719 let entries = archive.entries().map_err(|e| {
720 AnalysisError::archive(format!("Failed to read tar entries: {}", e))
721 })?;
722
723 let mut collected_metrics = Vec::new();
724 let mut stats = FilterStats::new();
725
726 for entry in entries {
727 let entry = entry.map_err(|e| {
728 AnalysisError::archive(format!("Failed to read tar entry: {}", e))
729 })?;
730
731 if let Ok(metrics) = Self::process_tar_entry_sync(entry, &filter, &mut stats) {
732 collected_metrics.push(metrics);
733 }
734 }
735
736 #[cfg(feature = "cli")]
737 log::info!(
738 "Filter stats: processed {}/{} files ({:.1}% filtered), saved {}",
739 stats.processed,
740 stats.total_entries,
741 stats.filter_ratio() * 100.0,
742 stats.format_bytes_saved()
743 );
744
745 Ok::<Vec<FileMetrics>, AnalysisError>(collected_metrics)
746 })
747 .await
748 .map_err(|e| AnalysisError::archive(format!("Task join error: {}", e)))??;
749
750 for metrics in metrics_result {
751 project_analysis.add_file_metrics(metrics)?;
752 }
753 }
754
755 #[cfg(target_arch = "wasm32")]
756 {
757 let decoder = GzDecoder::new(stream_reader);
758 let mut archive = Archive::new(decoder);
759
760 let entries = archive.entries().map_err(|e| {
761 AnalysisError::archive(format!("Failed to read tar entries: {}", e))
762 })?;
763
764 let mut stats = FilterStats::new();
765
766 for entry in entries {
767 let entry = entry.map_err(|e| {
768 AnalysisError::archive(format!("Failed to read tar entry: {}", e))
769 })?;
770
771 if let Ok(metrics) = Self::process_tar_entry_sync(entry, &self.filter, &mut stats) {
772 project_analysis.add_file_metrics(metrics)?;
773 }
774 }
775
776 web_sys::console::log_1(
777 &format!(
778 "Filter stats: processed {}/{} files ({:.1}% filtered), saved {}",
779 stats.processed,
780 stats.total_entries,
781 stats.filter_ratio() * 100.0,
782 stats.format_bytes_saved()
783 )
784 .into(),
785 );
786 }
787
788 Ok(())
789 }
790
791 #[cfg(target_arch = "wasm32")]
792 async fn process_tarball_bytes(
793 &self,
794 bytes: &bytes::Bytes,
795 project_analysis: &mut ProjectAnalysis,
796 ) -> Result<()> {
797 let cursor = Cursor::new(bytes.as_ref());
798 let decoder = GzDecoder::new(cursor);
799 let mut archive = Archive::new(decoder);
800
801 let entries = archive
802 .entries()
803 .map_err(|e| AnalysisError::archive(format!("Failed to read tar entries: {}", e)))?;
804
805 let mut stats = FilterStats::new();
806
807 for entry in entries {
808 let entry = entry
809 .map_err(|e| AnalysisError::archive(format!("Failed to read tar entry: {}", e)))?;
810
811 if let Ok(metrics) = Self::process_tar_entry_sync(entry, &self.filter, &mut stats) {
812 project_analysis.add_file_metrics(metrics)?;
813 }
814 }
815
816 web_sys::console::log_1(
817 &format!(
818 "Filter stats: processed {}/{} files ({:.1}% filtered), saved {}",
819 stats.processed,
820 stats.total_entries,
821 stats.filter_ratio() * 100.0,
822 stats.format_bytes_saved()
823 )
824 .into(),
825 );
826
827 Ok(())
828 }
829
830 fn process_tar_entry_sync<R: Read>(
831 mut entry: tar::Entry<'_, R>,
832 filter: &IntelligentFilter,
833 stats: &mut FilterStats,
834 ) -> Result<FileMetrics> {
835 let header = entry.header();
836 let path = header
837 .path()
838 .map_err(|e| AnalysisError::archive(format!("Invalid path in tar entry: {}", e)))?;
839
840 let file_path = path.to_string_lossy().to_string();
841
842 if !header.entry_type().is_file() || header.size().unwrap_or(0) == 0 {
843 return Err(AnalysisError::archive("Not a file or empty".to_string()));
844 }
845
846 let file_size = header.size().unwrap_or(0);
847
848 let should_process = filter.should_process_file(&file_path, file_size);
849 stats.record_entry(file_size, !should_process);
850
851 if !should_process {
852 return Err(AnalysisError::archive("File filtered out".to_string()));
853 }
854
855 let language = LanguageRegistry::detect_by_path(&file_path)
856 .map(|l| l.name.clone())
857 .unwrap_or_else(|| "Text".to_string());
858
859 let mut content = String::new();
860 if entry.read_to_string(&mut content).is_err() {
861 return Err(AnalysisError::archive(
862 "Failed to read file content".to_string(),
863 ));
864 }
865
866 analyze_file_content(&file_path, &content, &language, file_size)
867 }
868
869 fn extract_project_name_from_original(&self, url: &str) -> String {
870 if url.starts_with("http://") || url.starts_with("https://") {
871 let url = url.trim_end_matches('/');
872
873 if url.contains("/tree/") {
874 let parts: Vec<&str> = url.split('/').collect();
875 if let Some(tree_pos) = parts.iter().position(|&x| x == "tree") {
876 if tree_pos > 1 {
877 let repo = parts[tree_pos - 1];
878 let branch = parts.get(tree_pos + 1).unwrap_or(&"unknown");
879 return format!("{}@{}", repo, branch);
880 }
881 }
882 }
883
884 if url.contains("/commit/") {
885 let parts: Vec<&str> = url.split('/').collect();
886 if let Some(commit_pos) = parts.iter().position(|&x| x == "commit") {
887 if commit_pos > 1 {
888 let repo = parts[commit_pos - 1];
889 let commit = parts.get(commit_pos + 1).unwrap_or(&"unknown");
890 return format!("{}@{}", repo, &commit[..7.min(commit.len())]);
891 }
892 }
893 }
894
895 let parts: Vec<&str> = url.split('/').collect();
896 if parts.len() >= 2 {
897 let repo = parts[parts.len() - 1];
898 return format!("{}@main", repo);
899 }
900 } else if url.contains('/') && !url.contains('.') {
901 let parts: Vec<&str> = url.split('@').collect();
902 let repo_part = parts[0];
903 let branch = parts.get(1).unwrap_or(&"main");
904
905 if let Some(repo_name) = repo_part.split('/').last() {
906 return format!("{}@{}", repo_name, branch);
907 }
908 }
909
910 "remote-project".to_string()
911 }
912
913 #[allow(dead_code)]
914 fn extract_project_name(&self, url: &str) -> String {
915 let url_path = url.trim_end_matches('/');
916
917 if let Some(filename) = url_path.split('/').last() {
918 if filename.ends_with(".tar.gz") {
919 return filename.trim_end_matches(".tar.gz").to_string();
920 }
921 if filename.ends_with(".tgz") {
922 return filename.trim_end_matches(".tgz").to_string();
923 }
924 return filename.to_string();
925 }
926
927 "remote-project".to_string()
928 }
929
930 fn format_bytes_simple(bytes: u64) -> String {
931 const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
932 const THRESHOLD: f64 = 1024.0;
933
934 if bytes == 0 {
935 return "0 B".to_string();
936 }
937
938 let mut size = bytes as f64;
939 let mut unit_index = 0;
940
941 while size >= THRESHOLD && unit_index < UNITS.len() - 1 {
942 size /= THRESHOLD;
943 unit_index += 1;
944 }
945
946 if unit_index == 0 {
947 format!("{} {}", bytes, UNITS[unit_index])
948 } else {
949 format!("{:.1} {}", size, UNITS[unit_index])
950 }
951 }
952}
953
954impl Default for RemoteAnalyzer {
955 fn default() -> Self {
956 Self::new()
957 }
958}
959
960fn analyze_file_content(
961 file_path: &str,
962 content: &str,
963 language: &str,
964 file_size: u64,
965) -> Result<FileMetrics> {
966 let lines: Vec<&str> = content.lines().collect();
967 let total_lines = lines.len();
968
969 let mut code_lines = 0;
970 let mut comment_lines = 0;
971 let mut blank_lines = 0;
972
973 let lang_def = LanguageRegistry::get_language(language);
974 let empty_line_comments = vec![];
975 let empty_multi_line_comments = vec![];
976 let line_comments = lang_def
977 .map(|l| &l.line_comments)
978 .unwrap_or(&empty_line_comments);
979 let multi_line_comments = lang_def
980 .map(|l| &l.multi_line_comments)
981 .unwrap_or(&empty_multi_line_comments);
982
983 let mut in_multi_line_comment = false;
984
985 for line in lines {
986 let trimmed = line.trim();
987
988 if trimmed.is_empty() {
989 blank_lines += 1;
990 continue;
991 }
992
993 let mut is_comment = false;
994
995 if !in_multi_line_comment {
996 for comment_start in line_comments {
997 if trimmed.starts_with(comment_start) {
998 is_comment = true;
999 break;
1000 }
1001 }
1002
1003 for (start, end) in multi_line_comments {
1004 if trimmed.starts_with(start) {
1005 is_comment = true;
1006 if !trimmed.ends_with(end) {
1007 in_multi_line_comment = true;
1008 }
1009 break;
1010 }
1011 }
1012 } else {
1013 is_comment = true;
1014 for (_, end) in multi_line_comments {
1015 if trimmed.ends_with(end) {
1016 in_multi_line_comment = false;
1017 break;
1018 }
1019 }
1020 }
1021
1022 if is_comment {
1023 comment_lines += 1;
1024 } else {
1025 code_lines += 1;
1026 }
1027 }
1028
1029 let metrics = FileMetrics::new(
1030 file_path,
1031 language.to_string(),
1032 total_lines,
1033 code_lines,
1034 comment_lines,
1035 blank_lines,
1036 )?
1037 .with_size_bytes(file_size);
1038
1039 Ok(metrics)
1040}
1041
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain repository URLs map to the branch archive; commit URLs map to
    /// the commit archive regardless of the requested branch.
    #[test]
    fn test_github_url_parsing() {
        let analyzer = RemoteAnalyzer::new();

        let branch_archive = analyzer.parse_github_url("https://github.com/user/repo", "main");
        assert_eq!(
            branch_archive.as_deref(),
            Some("https://github.com/user/repo/archive/refs/heads/main.tar.gz")
        );

        let commit_archive =
            analyzer.parse_github_url("https://github.com/user/repo/commit/abc123", "main");
        assert_eq!(
            commit_archive.as_deref(),
            Some("https://github.com/user/repo/archive/abc123.tar.gz")
        );
    }

    /// Bitbucket repository URLs map to the `get/<branch>` archive.
    #[test]
    fn test_bitbucket_url_parsing() {
        let analyzer = RemoteAnalyzer::new();

        let archive = analyzer.parse_bitbucket_url("https://bitbucket.org/user/repo", "main");
        assert_eq!(
            archive.as_deref(),
            Some("https://bitbucket.org/user/repo/get/main.tar.gz")
        );
    }

    /// Codeberg repository URLs map to the `archive/<branch>` archive.
    #[test]
    fn test_codeberg_url_parsing() {
        let analyzer = RemoteAnalyzer::new();

        let archive = analyzer.parse_codeberg_url("https://codeberg.org/user/repo", "main");
        assert_eq!(
            archive.as_deref(),
            Some("https://codeberg.org/user/repo/archive/main.tar.gz")
        );
    }

    /// The project name is the last path segment without its archive extension.
    #[test]
    fn test_extract_project_name() {
        let analyzer = RemoteAnalyzer::new();

        let from_plain = analyzer.extract_project_name("https://example.com/project.tar.gz");
        assert_eq!(from_plain, "project");

        let from_github =
            analyzer.extract_project_name("https://github.com/user/repo/archive/main.tar.gz");
        assert_eq!(from_github, "main");
    }
}
1096
1097use tokio::sync::mpsc;
1098
/// Blocking [`Read`] adapter over a channel of downloaded byte chunks.
struct StreamReader {
    /// Receives chunks (or a terminal I/O error) from the download task.
    receiver: mpsc::Receiver<std::io::Result<bytes::Bytes>>,
    /// Chunk currently being handed out to the reader, if any.
    current_chunk: Option<Cursor<bytes::Bytes>>,
    /// Set once the channel has closed or produced an error.
    finished: bool,
}
1104
1105impl StreamReader {
1106 #[cfg(not(target_arch = "wasm32"))]
1107 fn new(
1108 stream: impl futures_util::Stream<Item = reqwest::Result<bytes::Bytes>> + Send + 'static,
1109 #[cfg(feature = "cli")] progress_bar: Option<ProgressBar>,
1110 total_size: Option<u64>,
1111 ) -> Self {
1112 let (tx, rx) = mpsc::channel(32);
1113
1114 tokio::spawn(async move {
1115 let mut downloaded = 0u64;
1116 let mut stream = Box::pin(stream);
1117
1118 while let Some(chunk_result) = stream.next().await {
1119 match chunk_result {
1120 Ok(chunk) => {
1121 downloaded += chunk.len() as u64;
1122
1123 #[cfg(feature = "cli")]
1124 if let Some(pb) = &progress_bar {
1125 if let Some(_total) = total_size {
1126 pb.set_position(downloaded);
1127 } else {
1128 let formatted = RemoteAnalyzer::format_bytes_simple(downloaded);
1129 pb.set_message(format!("Downloaded {}...", formatted));
1130 }
1131 }
1132
1133 if tx.send(Ok(chunk)).await.is_err() {
1134 break;
1135 }
1136 }
1137 Err(e) => {
1138 let _ = tx
1139 .send(Err(std::io::Error::new(
1140 std::io::ErrorKind::Other,
1141 format!("Stream error: {}", e),
1142 )))
1143 .await;
1144 break;
1145 }
1146 }
1147 }
1148 });
1149
1150 Self {
1151 receiver: rx,
1152 current_chunk: None,
1153 finished: false,
1154 }
1155 }
1156
1157 #[cfg(target_arch = "wasm32")]
1158 fn new(
1159 stream: impl futures_util::Stream<Item = reqwest::Result<bytes::Bytes>> + 'static,
1160 #[cfg(feature = "cli")] _progress_bar: Option<ProgressBar>,
1161 _total_size: Option<u64>,
1162 ) -> Self {
1163 let (tx, rx) = mpsc::channel(32);
1164
1165 wasm_bindgen_futures::spawn_local(async move {
1166 let mut downloaded = 0u64;
1167 let mut stream = Box::pin(stream);
1168
1169 while let Some(chunk_result) = stream.next().await {
1170 match chunk_result {
1171 Ok(chunk) => {
1172 downloaded += chunk.len() as u64;
1173
1174 #[cfg(feature = "cli")]
1175 if let Some(pb) = &_progress_bar {
1176 if let Some(_total) = _total_size {
1177 pb.set_position(downloaded);
1178 } else {
1179 let formatted = RemoteAnalyzer::format_bytes_simple(downloaded);
1180 pb.set_message(format!("Downloaded {}...", formatted));
1181 }
1182 }
1183
1184 if tx.send(Ok(chunk)).await.is_err() {
1185 break;
1186 }
1187 }
1188 Err(e) => {
1189 let _ = tx
1190 .send(Err(std::io::Error::new(
1191 std::io::ErrorKind::Other,
1192 format!("Stream error: {}", e),
1193 )))
1194 .await;
1195 break;
1196 }
1197 }
1198 }
1199 });
1200
1201 Self {
1202 receiver: rx,
1203 current_chunk: None,
1204 finished: false,
1205 }
1206 }
1207}
1208
1209impl Read for StreamReader {
1210 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
1211 if let Some(ref mut cursor) = self.current_chunk {
1212 let read = cursor.read(buf)?;
1213 if read > 0 {
1214 return Ok(read);
1215 }
1216 self.current_chunk = None;
1217 }
1218
1219 if self.finished {
1220 return Ok(0);
1221 }
1222
1223 match self.receiver.try_recv() {
1224 Ok(Ok(chunk)) => {
1225 self.current_chunk = Some(Cursor::new(chunk));
1226 if let Some(ref mut cursor) = self.current_chunk {
1227 cursor.read(buf)
1228 } else {
1229 Ok(0)
1230 }
1231 }
1232 Ok(Err(e)) => {
1233 self.finished = true;
1234 Err(e)
1235 }
1236 Err(mpsc::error::TryRecvError::Empty) => {
1237 #[cfg(not(target_arch = "wasm32"))]
1238 {
1239 match self.receiver.blocking_recv() {
1240 Some(Ok(chunk)) => {
1241 self.current_chunk = Some(Cursor::new(chunk));
1242 if let Some(ref mut cursor) = self.current_chunk {
1243 cursor.read(buf)
1244 } else {
1245 Ok(0)
1246 }
1247 }
1248 Some(Err(e)) => {
1249 self.finished = true;
1250 Err(e)
1251 }
1252 None => {
1253 self.finished = true;
1254 Ok(0)
1255 }
1256 }
1257 }
1258 #[cfg(target_arch = "wasm32")]
1259 {
1260 Err(std::io::Error::new(
1261 std::io::ErrorKind::WouldBlock,
1262 "Would block in WASM",
1263 ))
1264 }
1265 }
1266 Err(mpsc::error::TryRecvError::Disconnected) => {
1267 self.finished = true;
1268 Ok(0)
1269 }
1270 }
1271 }
1272}