Skip to main content

// socket_patch_core/crawlers/ruby_crawler.rs

1use std::collections::{HashMap, HashSet};
2use std::path::{Path, PathBuf};
3
4use super::types::{CrawledPackage, CrawlerOptions};
5
/// Crawler for the Ruby/RubyGems ecosystem.
///
/// Discovers installed gems either in a project's Bundler vendor directory
/// or in global gem installation paths. The type is a stateless unit struct,
/// so it derives `Debug`, `Clone` and `Copy` to be loggable and freely
/// passable by value.
#[derive(Debug, Clone, Copy)]
pub struct RubyCrawler;
9
10impl RubyCrawler {
11    /// Create a new `RubyCrawler`.
12    pub fn new() -> Self {
13        Self
14    }
15
16    // ------------------------------------------------------------------
17    // Public API
18    // ------------------------------------------------------------------
19
20    /// Get gem installation paths based on options.
21    ///
22    /// In local mode, checks `vendor/bundle/ruby/*/gems/` first (Bundler
23    /// deployment layout), but only if `Gemfile` or `Gemfile.lock` exists
24    /// in the cwd. Falls back to querying `gem env gemdir`.
25    ///
26    /// In global mode, queries `gem env gemdir` and `gem env gempath`, plus
27    /// well-known fallback paths for rbenv, rvm, Homebrew, and system Ruby.
28    pub async fn get_gem_paths(
29        &self,
30        options: &CrawlerOptions,
31    ) -> Result<Vec<PathBuf>, std::io::Error> {
32        if options.global || options.global_prefix.is_some() {
33            if let Some(ref custom) = options.global_prefix {
34                return Ok(vec![custom.clone()]);
35            }
36            return Ok(Self::get_global_gem_paths().await);
37        }
38
39        // Local mode: check vendor/bundle first
40        let vendor_gems = Self::get_vendor_bundle_paths(&options.cwd).await;
41        if !vendor_gems.is_empty() {
42            return Ok(vendor_gems);
43        }
44
45        // Only fall back to global gem paths if this looks like a Ruby project
46        let has_gemfile = tokio::fs::metadata(options.cwd.join("Gemfile"))
47            .await
48            .is_ok();
49        let has_gemfile_lock = tokio::fs::metadata(options.cwd.join("Gemfile.lock"))
50            .await
51            .is_ok();
52
53        if has_gemfile || has_gemfile_lock {
54            // Try gem env gemdir
55            let mut paths = Vec::new();
56            if let Some(gemdir) = Self::run_gem_env("gemdir").await {
57                let gems_path = PathBuf::from(gemdir).join("gems");
58                if is_dir(&gems_path).await {
59                    paths.push(gems_path);
60                }
61            }
62            if !paths.is_empty() {
63                return Ok(paths);
64            }
65        }
66
67        // Not a Ruby project — return empty
68        Ok(Vec::new())
69    }
70
71    /// Crawl all discovered gem paths and return every package found.
72    pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
73        let mut packages = Vec::new();
74        let mut seen = HashSet::new();
75
76        let gem_paths = self.get_gem_paths(options).await.unwrap_or_default();
77
78        for gem_path in &gem_paths {
79            let found = self.scan_gem_dir(gem_path, &mut seen).await;
80            packages.extend(found);
81        }
82
83        packages
84    }
85
86    /// Find specific packages by PURL inside a single gem directory.
87    ///
88    /// Gem directories follow the `<name>-<version>` pattern.
89    pub async fn find_by_purls(
90        &self,
91        gem_path: &Path,
92        purls: &[String],
93    ) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
94        let mut result: HashMap<String, CrawledPackage> = HashMap::new();
95
96        for purl in purls {
97            if let Some((name, version)) = crate::utils::purl::parse_gem_purl(purl) {
98                let gem_dir = gem_path.join(format!("{name}-{version}"));
99                if self.verify_gem_at_path(&gem_dir).await {
100                    result.insert(
101                        purl.clone(),
102                        CrawledPackage {
103                            name: name.to_string(),
104                            version: version.to_string(),
105                            namespace: None,
106                            purl: purl.clone(),
107                            path: gem_dir,
108                        },
109                    );
110                }
111            }
112        }
113
114        Ok(result)
115    }
116
117    // ------------------------------------------------------------------
118    // Private helpers
119    // ------------------------------------------------------------------
120
121    /// Find `vendor/bundle/ruby/*/gems/` directories.
122    async fn get_vendor_bundle_paths(cwd: &Path) -> Vec<PathBuf> {
123        let vendor_ruby = cwd.join("vendor").join("bundle").join("ruby");
124        let mut paths = Vec::new();
125
126        for entry in crate::utils::fs::list_dir_entries(&vendor_ruby).await {
127            if !crate::utils::fs::entry_is_dir(&entry).await {
128                continue;
129            }
130            let gems_dir = vendor_ruby.join(entry.file_name()).join("gems");
131            if is_dir(&gems_dir).await {
132                paths.push(gems_dir);
133            }
134        }
135        paths
136    }
137
138    /// Get global gem paths by querying `gem env` and checking well-known locations.
139    async fn get_global_gem_paths() -> Vec<PathBuf> {
140        let mut paths = Vec::new();
141        let mut seen = HashSet::new();
142
143        // gem env gemdir
144        if let Some(gemdir) = Self::run_gem_env("gemdir").await {
145            let gems_path = PathBuf::from(gemdir).join("gems");
146            if is_dir(&gems_path).await && seen.insert(gems_path.clone()) {
147                paths.push(gems_path);
148            }
149        }
150
151        // gem env gempath (colon-separated)
152        if let Some(gempath) = Self::run_gem_env("gempath").await {
153            for segment in gempath.split(':') {
154                let segment = segment.trim();
155                if segment.is_empty() {
156                    continue;
157                }
158                let gems_path = PathBuf::from(segment).join("gems");
159                if is_dir(&gems_path).await && seen.insert(gems_path.clone()) {
160                    paths.push(gems_path);
161                }
162            }
163        }
164
165        // Fallback well-known paths
166        let home = std::env::var("HOME")
167            .or_else(|_| std::env::var("USERPROFILE"))
168            .unwrap_or_else(|_| "~".to_string());
169        let home = PathBuf::from(home);
170
171        let fallback_globs = [
172            home.join(".gem").join("ruby"),
173            home.join(".rbenv").join("versions"),
174            home.join(".rvm").join("gems"),
175        ];
176
177        for base in &fallback_globs {
178            for entry in crate::utils::fs::list_dir_entries(base).await {
179                if !crate::utils::fs::entry_is_dir(&entry).await {
180                    continue;
181                }
182
183                let entry_path = base.join(entry.file_name());
184
185                // ~/.gem/ruby/*/gems/
186                let gems_dir = entry_path.join("gems");
187                if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
188                    paths.push(gems_dir);
189                    continue;
190                }
191
192                // ~/.rbenv/versions/*/lib/ruby/gems/*/gems/
193                let lib_ruby_gems = entry_path.join("lib").join("ruby").join("gems");
194                for sub_entry in crate::utils::fs::list_dir_entries(&lib_ruby_gems).await {
195                    let gems_dir = lib_ruby_gems.join(sub_entry.file_name()).join("gems");
196                    if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
197                        paths.push(gems_dir);
198                    }
199                }
200            }
201        }
202
203        // System paths
204        let system_bases = [
205            PathBuf::from("/usr/lib/ruby/gems"),
206            PathBuf::from("/usr/local/lib/ruby/gems"),
207            PathBuf::from("/opt/homebrew/lib/ruby/gems"),
208        ];
209
210        for base in &system_bases {
211            for entry in crate::utils::fs::list_dir_entries(base).await {
212                let gems_dir = base.join(entry.file_name()).join("gems");
213                if is_dir(&gems_dir).await && seen.insert(gems_dir.clone()) {
214                    paths.push(gems_dir);
215                }
216            }
217        }
218
219        paths
220    }
221
222    /// Run `gem env <key>` and return the trimmed stdout.
223    async fn run_gem_env(key: &str) -> Option<String> {
224        Self::run_gem_env_with(&crate::utils::process::SystemCommandRunner, key)
225    }
226
227    /// Version of `run_gem_env` that accepts an injected
228    /// `CommandRunner`. Tests use this with a `MockCommandRunner` to
229    /// exercise the success arm (gem binary present, stdout parsed)
230    /// without requiring ruby on the host's PATH.
231    fn run_gem_env_with(
232        runner: &dyn crate::utils::process::CommandRunner,
233        key: &str,
234    ) -> Option<String> {
235        parse_gem_env_output(runner.run("gem", &["env", key]).as_deref().unwrap_or(""))
236    }
237
238    /// Scan a gem directory and return all valid gem packages found.
239    async fn scan_gem_dir(
240        &self,
241        gem_path: &Path,
242        seen: &mut HashSet<String>,
243    ) -> Vec<CrawledPackage> {
244        let mut results = Vec::new();
245
246        for entry in crate::utils::fs::list_dir_entries(gem_path).await {
247            if !crate::utils::fs::entry_is_dir(&entry).await {
248                continue;
249            }
250
251            let dir_name = entry.file_name();
252            let dir_name_str = dir_name.to_string_lossy();
253
254            // Skip hidden directories
255            if dir_name_str.starts_with('.') {
256                continue;
257            }
258
259            let gem_dir = gem_path.join(&*dir_name_str);
260
261            // Parse name-version from directory name
262            if let Some((name, version)) = Self::parse_dir_name_version(&dir_name_str) {
263                // Verify it looks like a gem (has .gemspec or lib/)
264                if !self.verify_gem_at_path(&gem_dir).await {
265                    continue;
266                }
267
268                let purl = crate::utils::purl::build_gem_purl(&name, &version);
269
270                if seen.contains(&purl) {
271                    continue;
272                }
273                seen.insert(purl.clone());
274
275                results.push(CrawledPackage {
276                    name,
277                    version,
278                    namespace: None,
279                    purl,
280                    path: gem_dir,
281                });
282            }
283        }
284
285        results
286    }
287
288    /// Verify that a directory looks like an installed gem.
289    /// Checks for a `.gemspec` file or a `lib/` directory.
290    async fn verify_gem_at_path(&self, path: &Path) -> bool {
291        if !is_dir(path).await {
292            return false;
293        }
294
295        // Check for lib/ directory
296        if is_dir(&path.join("lib")).await {
297            return true;
298        }
299
300        // Check for any .gemspec file
301        for entry in crate::utils::fs::list_dir_entries(path).await {
302            if let Some(name) = entry.file_name().to_str() {
303                if name.ends_with(".gemspec") {
304                    return true;
305                }
306            }
307        }
308
309        false
310    }
311
312    /// Parse a gem directory name into (name, version).
313    ///
314    /// Gem directories follow the pattern `<name>-<version>`, where the
315    /// version is the last `-`-separated component that starts with a digit.
316    fn parse_dir_name_version(dir_name: &str) -> Option<(String, String)> {
317        // Find the last '-' followed by a digit
318        let mut split_idx = None;
319        for (i, _) in dir_name.match_indices('-') {
320            if dir_name[i + 1..].starts_with(|c: char| c.is_ascii_digit()) {
321                split_idx = Some(i);
322            }
323        }
324        let idx = split_idx?;
325        let name = &dir_name[..idx];
326        let version = &dir_name[idx + 1..];
327        if name.is_empty() || version.is_empty() {
328            return None;
329        }
330        Some((name.to_string(), version.to_string()))
331    }
332}
333
334impl Default for RubyCrawler {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
/// Pure parser for the stdout of `gem env <key>`.
///
/// Strips leading and trailing whitespace and returns the remainder, or
/// `None` when nothing is left. Kept free of any process handling so it can
/// be unit-tested without the gem CLI.
pub fn parse_gem_env_output(stdout: &str) -> Option<String> {
    match stdout.trim() {
        "" => None,
        trimmed => Some(trimmed.to_owned()),
    }
}
351
352/// Check whether a path is a directory.
353async fn is_dir(path: &Path) -> bool {
354    tokio::fs::metadata(path)
355        .await
356        .map(|m| m.is_dir())
357        .unwrap_or(false)
358}
359
360#[cfg(test)]
361mod tests {
362    use super::*;
363
364    #[test]
365    fn test_parse_gem_dir_name() {
366        assert_eq!(
367            RubyCrawler::parse_dir_name_version("rails-7.1.0"),
368            Some(("rails".to_string(), "7.1.0".to_string()))
369        );
370        assert_eq!(
371            RubyCrawler::parse_dir_name_version("nokogiri-1.16.5"),
372            Some(("nokogiri".to_string(), "1.16.5".to_string()))
373        );
374        assert_eq!(
375            RubyCrawler::parse_dir_name_version("activerecord-7.1.3.2"),
376            Some(("activerecord".to_string(), "7.1.3.2".to_string()))
377        );
378        assert_eq!(
379            RubyCrawler::parse_dir_name_version("net-http-0.4.1"),
380            Some(("net-http".to_string(), "0.4.1".to_string()))
381        );
382        assert!(RubyCrawler::parse_dir_name_version("no-version-here").is_none());
383        assert!(RubyCrawler::parse_dir_name_version("noversion").is_none());
384    }
385
386    #[tokio::test]
387    async fn test_find_by_purls_gem() {
388        let dir = tempfile::tempdir().unwrap();
389        let rails_dir = dir.path().join("rails-7.1.0");
390        tokio::fs::create_dir_all(rails_dir.join("lib")).await.unwrap();
391
392        let crawler = RubyCrawler::new();
393        let purls = vec![
394            "pkg:gem/rails@7.1.0".to_string(),
395            "pkg:gem/nokogiri@1.16.5".to_string(),
396        ];
397        let result = crawler.find_by_purls(dir.path(), &purls).await.unwrap();
398
399        assert_eq!(result.len(), 1);
400        assert!(result.contains_key("pkg:gem/rails@7.1.0"));
401        assert!(!result.contains_key("pkg:gem/nokogiri@1.16.5"));
402    }
403
404    #[tokio::test]
405    async fn test_crawl_all_gems() {
406        let dir = tempfile::tempdir().unwrap();
407
408        // Create fake gem directories with lib/
409        let rails_dir = dir.path().join("rails-7.1.0");
410        tokio::fs::create_dir_all(rails_dir.join("lib")).await.unwrap();
411
412        let nokogiri_dir = dir.path().join("nokogiri-1.16.5");
413        tokio::fs::create_dir_all(nokogiri_dir.join("lib")).await.unwrap();
414
415        let crawler = RubyCrawler::new();
416        let options = CrawlerOptions {
417            cwd: dir.path().to_path_buf(),
418            global: false,
419            global_prefix: Some(dir.path().to_path_buf()),
420            batch_size: 100,
421        };
422
423        let packages = crawler.crawl_all(&options).await;
424        assert_eq!(packages.len(), 2);
425
426        let purls: HashSet<_> = packages.iter().map(|p| p.purl.as_str()).collect();
427        assert!(purls.contains("pkg:gem/rails@7.1.0"));
428        assert!(purls.contains("pkg:gem/nokogiri@1.16.5"));
429    }
430
431    #[tokio::test]
432    async fn test_get_gem_paths_with_vendor_bundle() {
433        let dir = tempfile::tempdir().unwrap();
434        let vendor_gems = dir
435            .path()
436            .join("vendor")
437            .join("bundle")
438            .join("ruby")
439            .join("3.2.0")
440            .join("gems");
441        tokio::fs::create_dir_all(&vendor_gems).await.unwrap();
442
443        let paths = RubyCrawler::get_vendor_bundle_paths(dir.path()).await;
444        assert_eq!(paths.len(), 1);
445        assert_eq!(paths[0], vendor_gems);
446    }
447
448    #[tokio::test]
449    async fn test_deduplication() {
450        let dir = tempfile::tempdir().unwrap();
451
452        // Create a single gem directory
453        let rails_dir = dir.path().join("rails-7.1.0");
454        tokio::fs::create_dir_all(rails_dir.join("lib")).await.unwrap();
455
456        let crawler = RubyCrawler::new();
457        let options = CrawlerOptions {
458            cwd: dir.path().to_path_buf(),
459            global: false,
460            global_prefix: Some(dir.path().to_path_buf()),
461            batch_size: 100,
462        };
463
464        let packages = crawler.crawl_all(&options).await;
465        assert_eq!(packages.len(), 1);
466        assert_eq!(packages[0].purl, "pkg:gem/rails@7.1.0");
467    }
468
469    #[tokio::test]
470    async fn test_verify_gem_with_gemspec() {
471        let dir = tempfile::tempdir().unwrap();
472        let gem_dir = dir.path().join("rails-7.1.0");
473        tokio::fs::create_dir_all(&gem_dir).await.unwrap();
474        tokio::fs::write(gem_dir.join("rails.gemspec"), "# gemspec")
475            .await
476            .unwrap();
477
478        let crawler = RubyCrawler::new();
479        assert!(crawler.verify_gem_at_path(&gem_dir).await);
480    }
481
482    #[tokio::test]
483    async fn test_verify_gem_empty_dir_fails() {
484        let dir = tempfile::tempdir().unwrap();
485        let gem_dir = dir.path().join("rails-7.1.0");
486        tokio::fs::create_dir_all(&gem_dir).await.unwrap();
487
488        let crawler = RubyCrawler::new();
489        assert!(!crawler.verify_gem_at_path(&gem_dir).await);
490    }
491
492    /// `"-1.0.0"` — match_indices finds `i=0` (followed by `1`),
493    /// split_idx ends up Some(0), name slice is empty. The defensive
494    /// empty-name guard at the bottom of parse_dir_name_version
495    /// rejects rather than producing a `Gem("", "1.0.0")` ghost.
496    #[test]
497    fn test_parse_dir_name_version_empty_name_guard() {
498        assert_eq!(RubyCrawler::parse_dir_name_version("-1.0.0"), None);
499    }
500}