Skip to main content

socket_patch_core/crawlers/
ruby_crawler.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3
4use super::types::{CrawledPackage, CrawlerOptions};
5
/// Ruby/RubyGems ecosystem crawler for discovering gems in Bundler vendor
/// directories or global gem installation paths.
///
/// The crawler is a stateless unit struct, so it is trivially `Copy` and can
/// be printed with `{:?}` in logs and test failures.
#[derive(Debug, Clone, Copy)]
pub struct RubyCrawler;
9
10impl RubyCrawler {
11    /// Create a new `RubyCrawler`.
12    pub fn new() -> Self {
13        Self
14    }
15
16    // ------------------------------------------------------------------
17    // Public API
18    // ------------------------------------------------------------------
19
20    /// Get gem installation paths based on options.
21    ///
22    /// In local mode, checks `vendor/bundle/ruby/*/gems/` first (Bundler
23    /// deployment layout), but only if `Gemfile` or `Gemfile.lock` exists
24    /// in the cwd. Falls back to querying `gem env gemdir`.
25    ///
26    /// In global mode, queries `gem env gemdir` and `gem env gempath`, plus
27    /// well-known fallback paths for rbenv, rvm, Homebrew, and system Ruby.
28    pub async fn get_gem_paths(
29        &self,
30        options: &CrawlerOptions,
31    ) -> Result<Vec<PathBuf>, std::io::Error> {
32        if options.global || options.global_prefix.is_some() {
33            if let Some(ref custom) = options.global_prefix {
34                return Ok(vec![custom.clone()]);
35            }
36            return Ok(Self::get_global_gem_paths().await);
37        }
38
39        // Local mode: check vendor/bundle first
40        let vendor_gems = Self::get_vendor_bundle_paths(&options.cwd).await;
41        if !vendor_gems.is_empty() {
42            return Ok(vendor_gems);
43        }
44
45        // Only fall back to global gem paths if this looks like a Ruby project
46        let has_gemfile = tokio::fs::metadata(options.cwd.join("Gemfile"))
47            .await
48            .is_ok();
49        let has_gemfile_lock = tokio::fs::metadata(options.cwd.join("Gemfile.lock"))
50            .await
51            .is_ok();
52
53        if has_gemfile || has_gemfile_lock {
54            // Try gem env gemdir
55            let mut paths = Vec::new();
56            if let Some(gemdir) = Self::run_gem_env("gemdir").await {
57                let gems_path = PathBuf::from(gemdir).join("gems");
58                if is_dir(&gems_path).await {
59                    paths.push(gems_path);
60                }
61            }
62            if !paths.is_empty() {
63                return Ok(paths);
64            }
65        }
66
67        // Not a Ruby project — return empty
68        Ok(Vec::new())
69    }
70
71    /// Crawl all discovered gem paths and return every package found.
72    pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
73        let mut packages = Vec::new();
74        let mut seen = HashSet::new();
75
76        let gem_paths = self.get_gem_paths(options).await.unwrap_or_default();
77
78        for gem_path in &gem_paths {
79            let found = self.scan_gem_dir(gem_path, &mut seen).await;
80            packages.extend(found);
81        }
82
83        packages
84    }
85
86    /// Find specific packages by PURL inside a single gem directory.
87    ///
88    /// Gem directories follow the `<name>-<version>` pattern.
89    pub async fn find_by_purls(
90        &self,
91        gem_path: &Path,
92        purls: &[String],
93    ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
94        let mut result: HashMap<String, CrawledPackage> = HashMap::new();
95
96        for purl in purls {
97            if let Some((name, version)) = crate::utils::purl::parse_gem_purl(purl) {
98                let gem_dir = gem_path.join(format!("{name}-{version}"));
99                if self.verify_gem_at_path(&gem_dir).await {
100                    result.insert(
101                        purl.clone(),
102                        CrawledPackage {
103                            name: name.to_string(),
104                            version: version.to_string(),
105                            namespace: None,
106                            purl: purl.clone(),
107                            path: gem_dir,
108                        },
109                    );
110                }
111            }
112        }
113
114        Ok(result)
115    }
116
117    // ------------------------------------------------------------------
118    // Private helpers
119    // ------------------------------------------------------------------
120
121    /// Find `vendor/bundle/ruby/*/gems/` directories.
122    async fn get_vendor_bundle_paths(cwd: &Path) -> Vec<PathBuf> {
123        let vendor_ruby = cwd.join("vendor").join("bundle").join("ruby");
124        let mut paths = Vec::new();
125
126        let mut entries = match tokio::fs::read_dir(&vendor_ruby).await {
127            Ok(rd) => rd,
128            Err(_) => return paths,
129        };
130
131        while let Ok(Some(entry)) = entries.next_entry().await {
132            let ft = match entry.file_type().await {
133                Ok(ft) => ft,
134                Err(_) => continue,
135            };
136            if ft.is_dir() {
137                let gems_dir = vendor_ruby.join(entry.file_name()).join("gems");
138                if is_dir(&gems_dir).await {
139                    paths.push(gems_dir);
140                }
141            }
142        }
143
144        paths
145    }
146
147    /// Get global gem paths by querying `gem env` and checking well-known locations.
148    async fn get_global_gem_paths() -> Vec<PathBuf> {
149        let mut paths = Vec::new();
150        let mut seen = HashSet::new();
151
152        // gem env gemdir
153        if let Some(gemdir) = Self::run_gem_env("gemdir").await {
154            let gems_path = PathBuf::from(gemdir).join("gems");
155            if is_dir(&gems_path).await && seen.insert(gems_path.clone()) {
156                paths.push(gems_path);
157            }
158        }
159
160        // gem env gempath (colon-separated)
161        if let Some(gempath) = Self::run_gem_env("gempath").await {
162            for segment in gempath.split(':') {
163                let segment = segment.trim();
164                if segment.is_empty() {
165                    continue;
166                }
167                let gems_path = PathBuf::from(segment).join("gems");
168                if is_dir(&gems_path).await && seen.insert(gems_path.clone()) {
169                    paths.push(gems_path);
170                }
171            }
172        }
173
174        // Fallback well-known paths
175        let home = std::env::var("HOME")
176            .or_else(|_| std::env::var("USERPROFILE"))
177            .unwrap_or_else(|_| "~".to_string());
178        let home = PathBuf::from(home);
179
180        let fallback_globs = [
181            home.join(".gem").join("ruby"),
182            home.join(".rbenv").join("versions"),
183            home.join(".rvm").join("gems"),
184        ];
185
186        for base in &fallback_globs {
187            if let Ok(mut entries) = tokio::fs::read_dir(base).await {
188                while let Ok(Some(entry)) = entries.next_entry().await {
189                    let ft = match entry.file_type().await {
190                        Ok(ft) => ft,
191                        Err(_) => continue,
192                    };
193                    if !ft.is_dir() {
194                        continue;
195                    }
196
197                    let entry_path = base.join(entry.file_name());
198
199                    // ~/.gem/ruby/*/gems/
200                    let gems_dir = entry_path.join("gems");
201                    if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
202                        paths.push(gems_dir);
203                        continue;
204                    }
205
206                    // ~/.rbenv/versions/*/lib/ruby/gems/*/gems/
207                    let lib_ruby_gems = entry_path.join("lib").join("ruby").join("gems");
208                    if let Ok(mut sub_entries) = tokio::fs::read_dir(&lib_ruby_gems).await {
209                        while let Ok(Some(sub_entry)) = sub_entries.next_entry().await {
210                            let gems_dir = lib_ruby_gems.join(sub_entry.file_name()).join("gems");
211                            if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
212                                paths.push(gems_dir);
213                            }
214                        }
215                    }
216                }
217            }
218        }
219
220        // System paths
221        let system_bases = [
222            PathBuf::from("/usr/lib/ruby/gems"),
223            PathBuf::from("/usr/local/lib/ruby/gems"),
224            PathBuf::from("/opt/homebrew/lib/ruby/gems"),
225        ];
226
227        for base in &system_bases {
228            if let Ok(mut entries) = tokio::fs::read_dir(base).await {
229                while let Ok(Some(entry)) = entries.next_entry().await {
230                    let gems_dir = base.join(entry.file_name()).join("gems");
231                    if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
232                        paths.push(gems_dir);
233                    }
234                }
235            }
236        }
237
238        paths
239    }
240
241    /// Run `gem env <key>` and return the trimmed stdout.
242    async fn run_gem_env(key: &str) -> Option<String> {
243        let output = std::process::Command::new("gem")
244            .args(["env", key])
245            .output()
246            .ok()?;
247
248        if !output.status.success() {
249            return None;
250        }
251
252        let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
253        if stdout.is_empty() {
254            None
255        } else {
256            Some(stdout)
257        }
258    }
259
260    /// Scan a gem directory and return all valid gem packages found.
261    async fn scan_gem_dir(
262        &self,
263        gem_path: &Path,
264        seen: &mut HashSet<String>,
265    ) -> Vec<CrawledPackage> {
266        let mut results = Vec::new();
267
268        let mut entries = match tokio::fs::read_dir(gem_path).await {
269            Ok(rd) => rd,
270            Err(_) => return results,
271        };
272
273        let mut entry_list = Vec::new();
274        while let Ok(Some(entry)) = entries.next_entry().await {
275            entry_list.push(entry);
276        }
277
278        for entry in entry_list {
279            let ft = match entry.file_type().await {
280                Ok(ft) => ft,
281                Err(_) => continue,
282            };
283            if !ft.is_dir() {
284                continue;
285            }
286
287            let dir_name = entry.file_name();
288            let dir_name_str = dir_name.to_string_lossy();
289
290            // Skip hidden directories
291            if dir_name_str.starts_with('.') {
292                continue;
293            }
294
295            let gem_dir = gem_path.join(&*dir_name_str);
296
297            // Parse name-version from directory name
298            if let Some((name, version)) = Self::parse_dir_name_version(&dir_name_str) {
299                // Verify it looks like a gem (has .gemspec or lib/)
300                if !self.verify_gem_at_path(&gem_dir).await {
301                    continue;
302                }
303
304                let purl = crate::utils::purl::build_gem_purl(&name, &version);
305
306                if seen.contains(&purl) {
307                    continue;
308                }
309                seen.insert(purl.clone());
310
311                results.push(CrawledPackage {
312                    name,
313                    version,
314                    namespace: None,
315                    purl,
316                    path: gem_dir,
317                });
318            }
319        }
320
321        results
322    }
323
324    /// Verify that a directory looks like an installed gem.
325    /// Checks for a `.gemspec` file or a `lib/` directory.
326    async fn verify_gem_at_path(&self, path: &Path) -> bool {
327        if !is_dir(path).await {
328            return false;
329        }
330
331        // Check for lib/ directory
332        if is_dir(&path.join("lib")).await {
333            return true;
334        }
335
336        // Check for any .gemspec file
337        if let Ok(mut entries) = tokio::fs::read_dir(path).await {
338            while let Ok(Some(entry)) = entries.next_entry().await {
339                if let Some(name) = entry.file_name().to_str() {
340                    if name.ends_with(".gemspec") {
341                        return true;
342                    }
343                }
344            }
345        }
346
347        false
348    }
349
350    /// Parse a gem directory name into (name, version).
351    ///
352    /// Gem directories follow the pattern `<name>-<version>`, where the
353    /// version is the last `-`-separated component that starts with a digit.
354    fn parse_dir_name_version(dir_name: &str) -> Option<(String, String)> {
355        // Find the last '-' followed by a digit
356        let mut split_idx = None;
357        for (i, _) in dir_name.match_indices('-') {
358            if dir_name[i + 1..].starts_with(|c: char| c.is_ascii_digit()) {
359                split_idx = Some(i);
360            }
361        }
362        let idx = split_idx?;
363        let name = &dir_name[..idx];
364        let version = &dir_name[idx + 1..];
365        if name.is_empty() || version.is_empty() {
366            return None;
367        }
368        Some((name.to_string(), version.to_string()))
369    }
370}
371
372impl Default for RubyCrawler {
373    fn default() -> Self {
374        Self::new()
375    }
376}
377
378/// Check whether a path is a directory.
379async fn is_dir(path: &Path) -> bool {
380    tokio::fs::metadata(path)
381        .await
382        .map(|m| m.is_dir())
383        .unwrap_or(false)
384}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory names split into `(name, version)` at the version boundary.
    #[test]
    fn test_parse_gem_dir_name() {
        assert_eq!(
            RubyCrawler::parse_dir_name_version("rails-7.1.0"),
            Some(("rails".to_string(), "7.1.0".to_string()))
        );
        assert_eq!(
            RubyCrawler::parse_dir_name_version("nokogiri-1.16.5"),
            Some(("nokogiri".to_string(), "1.16.5".to_string()))
        );
        assert_eq!(
            RubyCrawler::parse_dir_name_version("activerecord-7.1.3.2"),
            Some(("activerecord".to_string(), "7.1.3.2".to_string()))
        );
        assert_eq!(
            RubyCrawler::parse_dir_name_version("net-http-0.4.1"),
            Some(("net-http".to_string(), "0.4.1".to_string()))
        );
        // A platform suffix stays attached to the version.
        assert_eq!(
            RubyCrawler::parse_dir_name_version("nokogiri-1.16.5-x86_64-linux"),
            Some(("nokogiri".to_string(), "1.16.5-x86_64-linux".to_string()))
        );
        assert!(RubyCrawler::parse_dir_name_version("no-version-here").is_none());
        assert!(RubyCrawler::parse_dir_name_version("noversion").is_none());
    }

    /// Only PURLs whose `<name>-<version>` directory exists are returned.
    #[tokio::test]
    async fn test_find_by_purls_gem() {
        let dir = tempfile::tempdir().unwrap();
        let rails_dir = dir.path().join("rails-7.1.0");
        tokio::fs::create_dir_all(rails_dir.join("lib")).await.unwrap();

        let crawler = RubyCrawler::new();
        let purls = vec![
            "pkg:gem/rails@7.1.0".to_string(),
            "pkg:gem/nokogiri@1.16.5".to_string(),
        ];
        let result = crawler.find_by_purls(dir.path(), &purls).await.unwrap();

        assert_eq!(result.len(), 1);
        assert!(result.contains_key("pkg:gem/rails@7.1.0"));
        assert!(!result.contains_key("pkg:gem/nokogiri@1.16.5"));
    }

    /// `crawl_all` reports every gem under an explicit `global_prefix`.
    #[tokio::test]
    async fn test_crawl_all_gems() {
        let dir = tempfile::tempdir().unwrap();

        // Create fake gem directories with lib/
        let rails_dir = dir.path().join("rails-7.1.0");
        tokio::fs::create_dir_all(rails_dir.join("lib")).await.unwrap();

        let nokogiri_dir = dir.path().join("nokogiri-1.16.5");
        tokio::fs::create_dir_all(nokogiri_dir.join("lib")).await.unwrap();

        let crawler = RubyCrawler::new();
        let options = CrawlerOptions {
            cwd: dir.path().to_path_buf(),
            global: false,
            global_prefix: Some(dir.path().to_path_buf()),
            batch_size: 100,
        };

        let packages = crawler.crawl_all(&options).await;
        assert_eq!(packages.len(), 2);

        let purls: HashSet<_> = packages.iter().map(|p| p.purl.as_str()).collect();
        assert!(purls.contains("pkg:gem/rails@7.1.0"));
        assert!(purls.contains("pkg:gem/nokogiri@1.16.5"));
    }

    /// A Bundler deployment layout under `cwd` is discovered.
    #[tokio::test]
    async fn test_get_gem_paths_with_vendor_bundle() {
        let dir = tempfile::tempdir().unwrap();
        let vendor_gems = dir
            .path()
            .join("vendor")
            .join("bundle")
            .join("ruby")
            .join("3.2.0")
            .join("gems");
        tokio::fs::create_dir_all(&vendor_gems).await.unwrap();

        let paths = RubyCrawler::get_vendor_bundle_paths(dir.path()).await;
        assert_eq!(paths.len(), 1);
        assert_eq!(paths[0], vendor_gems);
    }

    /// The same gem installed in two gem directories is reported only once
    /// when both scans share one `seen` set, which is how `crawl_all` calls
    /// `scan_gem_dir`.
    #[tokio::test]
    async fn test_deduplication() {
        let first_root = tempfile::tempdir().unwrap();
        let second_root = tempfile::tempdir().unwrap();

        for root in [first_root.path(), second_root.path()] {
            tokio::fs::create_dir_all(root.join("rails-7.1.0").join("lib"))
                .await
                .unwrap();
        }

        let crawler = RubyCrawler::new();
        let mut seen = HashSet::new();

        let first = crawler.scan_gem_dir(first_root.path(), &mut seen).await;
        assert_eq!(first.len(), 1);
        assert_eq!(first[0].purl, "pkg:gem/rails@7.1.0");
        assert_eq!(first[0].path, first_root.path().join("rails-7.1.0"));

        // Same PURL in a different directory: must be suppressed.
        let second = crawler.scan_gem_dir(second_root.path(), &mut seen).await;
        assert!(second.is_empty());
        assert_eq!(seen.len(), 1);
    }

    /// A `.gemspec` file alone is enough for a directory to count as a gem.
    #[tokio::test]
    async fn test_verify_gem_with_gemspec() {
        let dir = tempfile::tempdir().unwrap();
        let gem_dir = dir.path().join("rails-7.1.0");
        tokio::fs::create_dir_all(&gem_dir).await.unwrap();
        tokio::fs::write(gem_dir.join("rails.gemspec"), "# gemspec")
            .await
            .unwrap();

        let crawler = RubyCrawler::new();
        assert!(crawler.verify_gem_at_path(&gem_dir).await);
    }

    /// A directory with neither `lib/` nor a `.gemspec` is not a gem.
    #[tokio::test]
    async fn test_verify_gem_empty_dir_fails() {
        let dir = tempfile::tempdir().unwrap();
        let gem_dir = dir.path().join("rails-7.1.0");
        tokio::fs::create_dir_all(&gem_dir).await.unwrap();

        let crawler = RubyCrawler::new();
        assert!(!crawler.verify_gem_at_path(&gem_dir).await);
    }
}