Skip to main content

provenant/parsers/
ruby.rs

1//! Parser for Ruby/RubyGems package manifests.
2//!
3//! Extracts package metadata, dependencies, and platform information from
4//! Gemfile and Gemfile.lock files used by Ruby/Bundler projects.
5//!
6//! # Supported Formats
7//! - Gemfile (manifest with Ruby DSL)
8//! - Gemfile.lock (lockfile with state machine sections)
9//! - *.gemspec (gem specification files)
10//! - *.gem (gem archive packages)
11//! - metadata.gz-extract (pre-extracted gem metadata)
12//!
13//! # Key Features
14//! - State machine parsing for Gemfile.lock sections (GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH, DEPENDENCIES)
15//! - Regex-based Ruby DSL parsing for Gemfile
16//! - Dependency group handling (:development, :test, etc.)
17//! - Platform-specific gem support
18//! - Pessimistic version operator (~>) support
19//! - Bug Fix #1: Strip .freeze suffix from strings
20//! - Bug Fix #4: Correct dependency scope mapping (:runtime → None, :development → "development")
21//!
22//! # Implementation Notes
23//! - Uses regex for pattern matching (not full Ruby AST)
24//! - Graceful error handling: logs warnings and returns default on parse failure
25//! - PURL type: "gem"
26
27use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
28use crate::parsers::utils::split_name_email;
29use flate2::read::GzDecoder;
30use log::warn;
31use packageurl::PackageUrl;
32use regex::Regex;
33use std::collections::HashMap;
34use std::fs::{self, File};
35use std::io::Read;
36use std::path::{Path, PathBuf};
37use tar::Archive;
38
39use super::PackageParser;
40
41const PACKAGE_TYPE: PackageType = PackageType::Gem;
42
43// =============================================================================
44// Bug Fix #1: Strip .freeze suffix from strings
45// =============================================================================
46
/// Removes every trailing `.freeze` call from a Ruby source fragment.
///
/// Ruby code frequently writes string literals as `"value".freeze` to make
/// them immutable; the suffix is not part of the value itself. Repeated
/// suffixes (`"x".freeze.freeze`) are all removed. Input that does not end
/// with `.freeze` is returned unchanged.
///
/// # Examples
/// ```ignore
/// assert_eq!(strip_freeze_suffix("\"name\".freeze"), "\"name\"");
/// assert_eq!(strip_freeze_suffix("'1.0.0'.freeze"), "'1.0.0'");
/// ```
pub fn strip_freeze_suffix(s: &str) -> &str {
    const FREEZE_CALL: &str = ".freeze";

    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix(FREEZE_CALL) {
        remaining = shorter;
    }
    remaining
}
60
61// =============================================================================
62// Gemfile Parser (Ruby DSL)
63// =============================================================================
64
/// Parser for Bundler `Gemfile` manifests.
///
/// The `PackageParser` implementation reads the file and hands its text to
/// `parse_gemfile`, which uses regular expressions (not a Ruby interpreter) to
/// pick out `gem` declarations, their version constraints and the enclosing
/// `group ... do` block.
pub struct GemfileParser;
70
71impl PackageParser for GemfileParser {
72    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
73
74    fn extract_packages(path: &Path) -> Vec<PackageData> {
75        let content = match fs::read_to_string(path) {
76            Ok(c) => c,
77            Err(e) => {
78                warn!("Failed to read Gemfile at {:?}: {}", path, e);
79                return vec![default_package_data_with_datasource(DatasourceId::Gemfile)];
80            }
81        };
82
83        vec![parse_gemfile(&content)]
84    }
85
86    fn is_match(path: &Path) -> bool {
87        path.file_name()
88            .and_then(|n| n.to_str())
89            .is_some_and(|name| name == "Gemfile")
90            || path
91                .to_str()
92                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile"))
93    }
94}
95
96/// Parses Gemfile content and extracts dependencies with groups.
97fn parse_gemfile(content: &str) -> PackageData {
98    let mut dependencies = Vec::new();
99    let mut current_groups: Vec<String> = Vec::new();
100
101    // Regex patterns for Gemfile parsing
102    // gem "name", "version", options...
103    let gem_regex = match Regex::new(
104        r#"^\s*gem\s+["']([^"']+)["'](?:\.freeze)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*(.+))?"#,
105    ) {
106        Ok(r) => r,
107        Err(e) => {
108            warn!("Failed to compile gem regex: {}", e);
109            return default_package_data_with_datasource(DatasourceId::Gemfile);
110        }
111    };
112
113    // group :name do ... end
114    let group_start_regex = match Regex::new(r"^\s*group\s+(.+?)\s+do\s*$") {
115        Ok(r) => r,
116        Err(e) => {
117            warn!("Failed to compile group regex: {}", e);
118            return default_package_data_with_datasource(DatasourceId::Gemfile);
119        }
120    };
121
122    let group_end_regex = match Regex::new(r"^\s*end\s*$") {
123        Ok(r) => r,
124        Err(e) => {
125            warn!("Failed to compile end regex: {}", e);
126            return default_package_data_with_datasource(DatasourceId::Gemfile);
127        }
128    };
129
130    // Parse symbols like :development, :test
131    let symbol_regex = match Regex::new(r":(\w+)") {
132        Ok(r) => r,
133        Err(e) => {
134            warn!("Failed to compile symbol regex: {}", e);
135            return default_package_data_with_datasource(DatasourceId::Gemfile);
136        }
137    };
138
139    for line in content.lines() {
140        let trimmed = line.trim();
141
142        // Skip comments and empty lines
143        if trimmed.is_empty() || trimmed.starts_with('#') {
144            continue;
145        }
146
147        // Check for group start
148        if let Some(caps) = group_start_regex.captures(trimmed) {
149            let groups_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
150            current_groups.clear();
151            for cap in symbol_regex.captures_iter(groups_str) {
152                if let Some(group_name) = cap.get(1) {
153                    current_groups.push(group_name.as_str().to_string());
154                }
155            }
156            continue;
157        }
158
159        // Check for group end
160        if group_end_regex.is_match(trimmed) {
161            current_groups.clear();
162            continue;
163        }
164
165        // Parse gem declaration
166        if let Some(caps) = gem_regex.captures(trimmed) {
167            let name = strip_freeze_suffix(caps.get(1).map(|m| m.as_str()).unwrap_or(""));
168            if name.is_empty() {
169                continue;
170            }
171
172            // Collect version constraints
173            let mut version_parts = Vec::new();
174            if let Some(v) = caps.get(2) {
175                version_parts.push(strip_freeze_suffix(v.as_str()).to_string());
176            }
177            if let Some(v) = caps.get(3) {
178                let v_str = strip_freeze_suffix(v.as_str());
179                // Check if it looks like a version constraint
180                if looks_like_version_constraint(v_str) {
181                    version_parts.push(v_str.to_string());
182                }
183            }
184
185            let extracted_requirement = if version_parts.is_empty() {
186                None
187            } else {
188                Some(version_parts.join(", "))
189            };
190
191            // Determine scope based on current group
192            // Bug Fix #4: :runtime → None, :development → "development"
193            let (scope, is_runtime, is_optional) = if current_groups.is_empty() {
194                // No group = runtime dependency
195                (None, true, false)
196            } else if current_groups.iter().any(|g| g == "development") {
197                (Some("development".to_string()), false, true)
198            } else if current_groups.iter().any(|g| g == "test") {
199                (Some("test".to_string()), false, true)
200            } else {
201                // Other groups (e.g., :production)
202                let group = current_groups.first().cloned();
203                (group, true, false)
204            };
205
206            // Create PURL
207            let purl = create_gem_purl(name, None);
208
209            dependencies.push(Dependency {
210                purl,
211                extracted_requirement,
212                scope,
213                is_runtime: Some(is_runtime),
214                is_optional: Some(is_optional),
215                is_pinned: None,
216                is_direct: Some(true),
217                resolved_package: None,
218                extra_data: None,
219            });
220        }
221    }
222
223    PackageData {
224        package_type: Some(PACKAGE_TYPE),
225        primary_language: Some("Ruby".to_string()),
226        dependencies,
227        datasource_id: Some(DatasourceId::Gemfile),
228        ..default_package_data()
229    }
230}
231
/// Reports whether `s` begins like a version requirement.
///
/// A string qualifies when its first character is one of the operator
/// characters `~`, `>`, `<`, `=`, `!` or an ASCII digit. The empty string
/// never qualifies.
fn looks_like_version_constraint(s: &str) -> bool {
    match s.chars().next() {
        Some('~' | '>' | '<' | '=' | '!') => true,
        Some(first) => first.is_ascii_digit(),
        None => false,
    }
}
241
242// =============================================================================
243// Gemfile.lock Parser (State Machine)
244// =============================================================================
245
/// Parser for Bundler `Gemfile.lock` lockfiles.
///
/// The `PackageParser` implementation reads the file and passes its text to
/// `parse_gemfile_lock`, a line-oriented state machine that recognises the
/// GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH and DEPENDENCIES sections.
pub struct GemfileLockParser;
251
252impl PackageParser for GemfileLockParser {
253    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
254
255    fn extract_packages(path: &Path) -> Vec<PackageData> {
256        let content = match fs::read_to_string(path) {
257            Ok(c) => c,
258            Err(e) => {
259                warn!("Failed to read Gemfile.lock at {:?}: {}", path, e);
260                return vec![default_package_data_with_datasource(
261                    DatasourceId::GemfileLock,
262                )];
263            }
264        };
265
266        vec![parse_gemfile_lock(&content)]
267    }
268
269    fn is_match(path: &Path) -> bool {
270        path.file_name()
271            .and_then(|n| n.to_str())
272            .is_some_and(|name| name == "Gemfile.lock")
273            || path
274                .to_str()
275                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile.lock"))
276    }
277}
278
/// Parse state for the Gemfile.lock state machine.
///
/// The state records which section of the lockfile the line currently being
/// processed belongs to.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// Initial state, before any recognised section header; lines are ignored.
    None,
    /// Inside a `GEM` section, reading `key: value` options.
    Gem,
    /// Inside a `GIT` section, reading `key: value` options.
    Git,
    /// Inside a `PATH` section, reading `key: value` options.
    Path,
    /// Inside an `SVN` section, reading `key: value` options.
    Svn,
    /// After the `specs:` line of a GEM/GIT/PATH/SVN section; 4-space-indented
    /// lines are locked gems.
    Specs,
    /// Inside `PLATFORMS`; every non-empty line is recorded as a platform.
    Platforms,
    /// Inside `BUNDLED WITH`; the indented line holds the Bundler version.
    BundledWith,
    /// Inside `DEPENDENCIES`; 2-space-indented lines are dependency entries.
    Dependencies,
}
292
/// Parsed gem information from Gemfile.lock.
///
/// All fields are actively used:
/// - `gem_type`, `remote`, `revision`, `ref_field`, `branch`, `tag`: Stored in extra_data for GIT/PATH/SVN sources
/// - `name`, `version`, `platform`, `pinned`: Used for dependency PURL and metadata generation
/// - `requirements`: Stored as extracted_requirement for version constraints
#[derive(Debug, Clone, Default)]
struct GemInfo {
    /// Gem name as written in the lockfile.
    name: String,
    /// Locked version (the part of `(x.y.z-platform)` before the first `-`), if any.
    version: Option<String>,
    /// Platform suffix of the locked version (after the first `-`), if any.
    platform: Option<String>,
    /// Header of the source section the gem came from: "GEM", "GIT", "PATH" or "SVN".
    gem_type: String,
    /// Value of the section's `remote:` option.
    remote: Option<String>,
    /// Value of the section's `revision:` option.
    revision: Option<String>,
    /// Value of the section's `ref:` option (`ref` is a Rust keyword, hence the name).
    ref_field: Option<String>,
    /// Value of the section's `branch:` option.
    branch: Option<String>,
    /// Value of the section's `tag:` option.
    tag: Option<String>,
    /// Whether the DEPENDENCIES entry for this gem ended with `!`.
    pinned: bool,
    /// Version constraints listed for this gem in the DEPENDENCIES section.
    requirements: Vec<String>,
}
313
314/// Parses Gemfile.lock content using a state machine.
315fn parse_gemfile_lock(content: &str) -> PackageData {
316    let mut state = ParseState::None;
317    let mut dependencies = Vec::new();
318    let mut gems: HashMap<String, GemInfo> = HashMap::new();
319    let mut platforms: Vec<String> = Vec::new();
320    let mut bundler_version: Option<String> = None;
321    let mut current_gem_type = String::new();
322    let mut current_remote: Option<String> = None;
323    let mut current_options: HashMap<String, String> = HashMap::new();
324
325    // DEPS pattern: 2 spaces at line start
326    let deps_regex = match Regex::new(r"^ {2}([^ \)\(,!:]+)(?: \(([^)]+)\))?(!)?$") {
327        Ok(r) => r,
328        Err(e) => {
329            warn!("Failed to compile deps regex: {}", e);
330            return default_package_data_with_datasource(DatasourceId::GemfileLock);
331        }
332    };
333
334    // SPEC_DEPS pattern: 4 spaces at line start
335    let spec_deps_regex = match Regex::new(r"^ {4}([^ \)\(,!:]+)(?: \(([^)]+)\))?$") {
336        Ok(r) => r,
337        Err(e) => {
338            warn!("Failed to compile spec_deps regex: {}", e);
339            return default_package_data_with_datasource(DatasourceId::GemfileLock);
340        }
341    };
342
343    // OPTIONS pattern: key: value
344    let options_regex = match Regex::new(r"^ {2}([a-z]+): (.+)$") {
345        Ok(r) => r,
346        Err(e) => {
347            warn!("Failed to compile options regex: {}", e);
348            return default_package_data_with_datasource(DatasourceId::GemfileLock);
349        }
350    };
351
352    // VERSION pattern for BUNDLED WITH
353    let version_regex = match Regex::new(r"^\s+(\d+(?:\.\d+)+)\s*$") {
354        Ok(r) => r,
355        Err(e) => {
356            warn!("Failed to compile version regex: {}", e);
357            return default_package_data_with_datasource(DatasourceId::GemfileLock);
358        }
359    };
360
361    for line in content.lines() {
362        let trimmed = line.trim_end();
363
364        // Empty line resets state
365        if trimmed.is_empty() {
366            current_options.clear();
367            continue;
368        }
369
370        // Section headers (no leading whitespace) and sub-section headers
371        match trimmed {
372            "GEM" => {
373                state = ParseState::Gem;
374                current_gem_type = "GEM".to_string();
375                current_remote = None;
376                current_options.clear();
377                continue;
378            }
379            "GIT" => {
380                state = ParseState::Git;
381                current_gem_type = "GIT".to_string();
382                current_remote = None;
383                current_options.clear();
384                continue;
385            }
386            "PATH" => {
387                state = ParseState::Path;
388                current_gem_type = "PATH".to_string();
389                current_remote = None;
390                current_options.clear();
391                continue;
392            }
393            "SVN" => {
394                state = ParseState::Svn;
395                current_gem_type = "SVN".to_string();
396                current_remote = None;
397                current_options.clear();
398                continue;
399            }
400            "PLATFORMS" => {
401                state = ParseState::Platforms;
402                continue;
403            }
404            "BUNDLED WITH" => {
405                state = ParseState::BundledWith;
406                continue;
407            }
408            "DEPENDENCIES" => {
409                state = ParseState::Dependencies;
410                continue;
411            }
412            _ => {}
413        }
414
415        // Check for "  specs:" sub-section header (2-space indent) within
416        // GEM/GIT/PATH/SVN sections. This must be checked separately because
417        // the leading whitespace is preserved by trim_end().
418        if trimmed.trim() == "specs:" {
419            state = match state {
420                ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
421                    ParseState::Specs
422                }
423                _ => state,
424            };
425            continue;
426        }
427
428        // Process based on current state
429        match state {
430            ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
431                // Parse options (remote:, revision:, ref:, branch:, tag:)
432                if let Some(caps) = options_regex.captures(line) {
433                    let key = caps.get(1).map(|m| m.as_str()).unwrap_or("");
434                    let value = caps.get(2).map(|m| m.as_str()).unwrap_or("");
435                    current_options.insert(key.to_string(), value.to_string());
436                    if key == "remote" {
437                        current_remote = Some(value.to_string());
438                    }
439                }
440            }
441            ParseState::Specs => {
442                // Parse gem specs (4 spaces indent)
443                if let Some(caps) = spec_deps_regex.captures(line) {
444                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
445                    let version_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
446
447                    // Parse version and platform
448                    let (version, platform) = parse_version_platform(version_str);
449
450                    if !name.is_empty() {
451                        let gem_info = GemInfo {
452                            name: name.clone(),
453                            version,
454                            platform,
455                            gem_type: current_gem_type.clone(),
456                            remote: current_remote.clone(),
457                            revision: current_options.get("revision").cloned(),
458                            ref_field: current_options.get("ref").cloned(),
459                            branch: current_options.get("branch").cloned(),
460                            tag: current_options.get("tag").cloned(),
461                            pinned: false,
462                            requirements: Vec::new(),
463                        };
464                        gems.insert(name, gem_info);
465                    }
466                }
467            }
468            ParseState::Platforms => {
469                // Parse platform entries (2 spaces indent)
470                let platform = trimmed.trim();
471                if !platform.is_empty() {
472                    platforms.push(platform.to_string());
473                }
474            }
475            ParseState::BundledWith => {
476                // Parse bundler version
477                if let Some(caps) = version_regex.captures(line) {
478                    bundler_version = caps.get(1).map(|m| m.as_str().to_string());
479                }
480            }
481            ParseState::Dependencies => {
482                // Parse direct dependencies (2 spaces indent)
483                if let Some(caps) = deps_regex.captures(line) {
484                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
485                    let version_constraint = caps.get(2).map(|m| m.as_str().to_string());
486                    let pinned = caps.get(3).is_some();
487
488                    if !name.is_empty() {
489                        // Update gem info if exists, or create new
490                        if let Some(gem) = gems.get_mut(&name) {
491                            gem.pinned = pinned;
492                            if let Some(vc) = &version_constraint {
493                                gem.requirements.push(vc.clone());
494                            }
495                        } else {
496                            let gem_info = GemInfo {
497                                name: name.clone(),
498                                version: None,
499                                platform: None,
500                                gem_type: "GEM".to_string(),
501                                remote: None,
502                                revision: None,
503                                ref_field: None,
504                                branch: None,
505                                tag: None,
506                                pinned,
507                                requirements: version_constraint.into_iter().collect(),
508                            };
509                            gems.insert(name, gem_info);
510                        }
511                    }
512                }
513            }
514            ParseState::None => {}
515        }
516    }
517
518    let primary_gem = gems.values().find(|gem| gem.gem_type == "PATH").cloned();
519
520    let (
521        package_name,
522        package_version,
523        repository_homepage_url,
524        repository_download_url,
525        api_data_url,
526        download_url,
527    ) = if let Some(ref pg) = primary_gem {
528        let urls = get_rubygems_urls(&pg.name, pg.version.as_deref(), pg.platform.as_deref());
529        (
530            Some(pg.name.clone()),
531            pg.version.clone(),
532            urls.0,
533            urls.1,
534            urls.2,
535            urls.3,
536        )
537    } else {
538        (None, None, None, None, None, None)
539    };
540
541    for (_, gem) in gems {
542        if let Some(ref pg) = primary_gem
543            && gem.name == pg.name
544        {
545            continue;
546        }
547
548        let version_for_purl = gem.version.as_deref();
549        let purl = create_gem_purl(&gem.name, version_for_purl);
550
551        let extracted_requirement = if !gem.requirements.is_empty() {
552            Some(gem.requirements.join(", "))
553        } else {
554            gem.version.clone()
555        };
556
557        let extra_data = build_gem_source_extra_data(&gem);
558
559        dependencies.push(Dependency {
560            purl,
561            extracted_requirement,
562            scope: Some("dependencies".to_string()),
563            is_runtime: Some(true),
564            is_optional: Some(false),
565            is_pinned: Some(gem.pinned),
566            is_direct: Some(true),
567            resolved_package: None,
568            extra_data,
569        });
570    }
571
572    dependencies.sort_by(|left, right| {
573        left.purl
574            .as_deref()
575            .cmp(&right.purl.as_deref())
576            .then_with(|| {
577                left.extracted_requirement
578                    .as_deref()
579                    .cmp(&right.extracted_requirement.as_deref())
580            })
581    });
582
583    // Build extra_data
584    let mut extra_data = HashMap::new();
585    if !platforms.is_empty() {
586        extra_data.insert(
587            "platforms".to_string(),
588            serde_json::Value::Array(
589                platforms
590                    .into_iter()
591                    .map(serde_json::Value::String)
592                    .collect(),
593            ),
594        );
595    }
596    if let Some(bv) = bundler_version {
597        extra_data.insert("bundler_version".to_string(), serde_json::Value::String(bv));
598    }
599
600    let purl = package_name
601        .as_deref()
602        .map(|n| create_gem_purl(n, package_version.as_deref()))
603        .unwrap_or(None);
604
605    PackageData {
606        package_type: Some(PACKAGE_TYPE),
607        name: package_name,
608        version: package_version,
609        primary_language: Some("Ruby".to_string()),
610        download_url,
611        dependencies,
612        repository_homepage_url,
613        repository_download_url,
614        api_data_url,
615        extra_data: if extra_data.is_empty() {
616            None
617        } else {
618            Some(extra_data)
619        },
620        datasource_id: Some(DatasourceId::GemfileLock),
621        purl,
622        ..default_package_data()
623    }
624}
625
626fn build_gem_source_extra_data(gem: &GemInfo) -> Option<HashMap<String, serde_json::Value>> {
627    if gem.gem_type != "GIT" && gem.gem_type != "PATH" && gem.gem_type != "SVN" {
628        return None;
629    }
630
631    let mut extra = HashMap::new();
632    extra.insert(
633        "source_type".to_string(),
634        serde_json::Value::String(gem.gem_type.clone()),
635    );
636
637    if let Some(ref remote) = gem.remote {
638        extra.insert(
639            "remote".to_string(),
640            serde_json::Value::String(remote.clone()),
641        );
642    }
643    if let Some(ref revision) = gem.revision {
644        extra.insert(
645            "revision".to_string(),
646            serde_json::Value::String(revision.clone()),
647        );
648    }
649    if let Some(ref ref_field) = gem.ref_field {
650        extra.insert(
651            "ref".to_string(),
652            serde_json::Value::String(ref_field.clone()),
653        );
654    }
655    if let Some(ref branch) = gem.branch {
656        extra.insert(
657            "branch".to_string(),
658            serde_json::Value::String(branch.clone()),
659        );
660    }
661    if let Some(ref tag) = gem.tag {
662        extra.insert("tag".to_string(), serde_json::Value::String(tag.clone()));
663    }
664
665    Some(extra)
666}
667
/// Splits a locked version string into its version and platform parts.
///
/// The split happens at the first `-`; everything after it is the platform.
/// Examples: "2.6.3" -> ("2.6.3", None), "2.6.3-java" -> ("2.6.3", Some("java")).
/// An empty input yields `(None, None)`.
fn parse_version_platform(s: &str) -> (Option<String>, Option<String>) {
    if s.is_empty() {
        return (None, None);
    }

    match s.split_once('-') {
        Some((version, platform)) => (Some(version.to_owned()), Some(platform.to_owned())),
        None => (Some(s.to_owned()), None),
    }
}
682
683/// Creates a gem PURL.
684fn create_gem_purl(name: &str, version: Option<&str>) -> Option<String> {
685    let mut purl = match PackageUrl::new(PACKAGE_TYPE.as_str(), name) {
686        Ok(p) => p,
687        Err(e) => {
688            warn!("Failed to create PURL for gem '{}': {}", name, e);
689            return None;
690        }
691    };
692
693    if let Some(v) = version
694        && let Err(e) = purl.with_version(v)
695    {
696        warn!("Failed to set version '{}' for gem '{}': {}", v, name, e);
697    }
698
699    Some(purl.to_string())
700}
701
/// Returns the rubygems.org page for a gem, or `None` when `name` is empty.
///
/// With a version the URL points at that version's page; the version is
/// trimmed of whitespace and of leading/trailing `/` first. `name` is used
/// as given.
fn rubygems_homepage_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(raw) => {
            let v = raw.trim().trim_matches('/');
            format!("https://rubygems.org/gems/{}/versions/{}", name, v)
        }
        None => format!("https://rubygems.org/gems/{}", name),
    };
    Some(url)
}
714
/// Returns the rubygems.org download URL of a `.gem` archive.
///
/// Yields `None` when `name` is empty or no version is given. Name and
/// version are trimmed of whitespace and of leading/trailing `/`. A platform
/// other than "ruby" is appended to the version as `-<platform>`.
fn rubygems_download_url(
    name: &str,
    version: Option<&str>,
    platform: Option<&str>,
) -> Option<String> {
    if name.is_empty() {
        return None;
    }
    let raw_version = version?;

    let name = name.trim().trim_matches('/');
    let version = raw_version.trim().trim_matches('/');

    // The plain "ruby" platform is the default and has no filename suffix.
    let version_plat = match platform {
        Some(p) if p != "ruby" => format!("{}-{}", version, p),
        _ => version.to_string(),
    };

    Some(format!(
        "https://rubygems.org/downloads/{}-{}.gem",
        name, version_plat
    ))
}
742
/// Returns the rubygems.org API URL for a gem, or `None` when `name` is empty.
///
/// With a version the v2 per-version endpoint is used; without one, the v1
/// versions listing endpoint.
fn rubygems_api_url(name: &str, version: Option<&str>) -> Option<String> {
    (!name.is_empty()).then(|| match version {
        Some(v) => format!(
            "https://rubygems.org/api/v2/rubygems/{}/versions/{}.json",
            name, v
        ),
        None => format!("https://rubygems.org/api/v1/versions/{}.json", name),
    })
}
760
761fn get_rubygems_urls(
762    name: &str,
763    version: Option<&str>,
764    platform: Option<&str>,
765) -> (
766    Option<String>,
767    Option<String>,
768    Option<String>,
769    Option<String>,
770) {
771    let repository_homepage_url = rubygems_homepage_url(name, version);
772    let repository_download_url = rubygems_download_url(name, version, platform);
773    let api_data_url = rubygems_api_url(name, version);
774    let download_url = repository_download_url.clone();
775
776    (
777        repository_homepage_url,
778        repository_download_url,
779        api_data_url,
780        download_url,
781    )
782}
783
784/// Returns a default PackageData with gem-specific settings.
785fn default_package_data() -> PackageData {
786    PackageData {
787        package_type: Some(PACKAGE_TYPE),
788        primary_language: Some("Ruby".to_string()),
789        ..Default::default()
790    }
791}
792
793fn default_package_data_with_datasource(datasource_id: DatasourceId) -> PackageData {
794    PackageData {
795        datasource_id: Some(datasource_id),
796        ..default_package_data()
797    }
798}
799
800// =============================================================================
801// Gemspec Parser (Ruby DSL)
802// =============================================================================
803
/// Ruby .gemspec file parser.
///
/// Parses `Gem::Specification.new` blocks using regex-based extraction.
/// Handles frozen strings (Bug #1), variable version resolution (Bug #2),
/// and RFC 5322 email parsing (Bug #6).
///
/// The `PackageParser` implementation matches files with the `gemspec`
/// extension and passes the file's parent directory along to the parsing
/// routine as context.
pub struct GemspecParser;
810
811impl PackageParser for GemspecParser {
812    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
813
814    fn extract_packages(path: &Path) -> Vec<PackageData> {
815        let content = match fs::read_to_string(path) {
816            Ok(c) => c,
817            Err(e) => {
818                warn!("Failed to read .gemspec at {:?}: {}", path, e);
819                return vec![default_package_data_with_datasource(DatasourceId::Gemspec)];
820            }
821        };
822
823        vec![parse_gemspec_with_context(&content, path.parent())]
824    }
825
826    fn is_match(path: &Path) -> bool {
827        path.extension()
828            .and_then(|ext| ext.to_str())
829            .is_some_and(|ext| ext == "gemspec")
830    }
831}
832
833/// Cleans a value extracted from gemspec by stripping quotes, .freeze, %q{}, and brackets.
834fn clean_gemspec_value(s: &str) -> String {
835    let s = strip_freeze_suffix(s).trim();
836
837    let s = if let Some(pos) = s.find(" #") {
838        s[..pos].trim()
839    } else {
840        s
841    };
842
843    let s = if let Some(stripped) = s.strip_prefix("%q{") {
844        stripped.strip_suffix('}').unwrap_or(stripped)
845    } else if let Some(stripped) = s.strip_prefix("%q<") {
846        stripped.strip_suffix('>').unwrap_or(stripped)
847    } else if let Some(stripped) = s.strip_prefix("%q[") {
848        stripped.strip_suffix(']').unwrap_or(stripped)
849    } else if let Some(stripped) = s.strip_prefix("%q(") {
850        stripped.strip_suffix(')').unwrap_or(stripped)
851    } else {
852        s
853    };
854
855    let s = s
856        .trim_start_matches('"')
857        .trim_end_matches('"')
858        .trim_start_matches('\'')
859        .trim_end_matches('\'');
860    let s = strip_freeze_suffix(s).trim();
861    s.to_string()
862}
863
864/// Extracts items from a Ruby array literal like `["a", "b", "c"]`.
865fn extract_ruby_array(s: &str) -> Vec<String> {
866    let s = strip_freeze_suffix(s.trim());
867    let s = s.trim_start_matches('[').trim_end_matches(']');
868    let item_re = match Regex::new(r#"["']([^"']*?)["'](?:\.freeze)?"#) {
869        Ok(r) => r,
870        Err(_) => return Vec::new(),
871    };
872    item_re
873        .captures_iter(s)
874        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
875        .collect()
876}
877
878fn extract_all_ruby_values(s: &str) -> Vec<String> {
879    let value_re = match Regex::new(r#"%q[\{<\[(]([^\}>\])]+)[\}>\])]|["']([^"']+)["']"#) {
880        Ok(r) => r,
881        Err(_) => return Vec::new(),
882    };
883
884    value_re
885        .captures_iter(s)
886        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)))
887        .map(|m| clean_gemspec_value(m.as_str()))
888        .collect()
889}
890
891fn extract_first_ruby_value(s: &str) -> Option<String> {
892    extract_all_ruby_values(s).into_iter().next()
893}
894
/// Returns everything that follows the first top-level comma in `args`.
///
/// A comma does not count while inside a single- or double-quoted string
/// (a backslash there skips the next character), inside `[]`, `{}` or `<>`
/// groups, or inside parentheses. The returned slice is trimmed; it is empty
/// when `args` has no top-level comma.
fn after_first_argument(args: &str) -> &str {
    let mut open_quote: Option<char> = None;
    let mut skip_next = false;
    let mut brackets = 0usize;
    let mut parens = 0usize;

    for (offset, c) in args.char_indices() {
        if skip_next {
            // This character was escaped by a backslash inside a string.
            skip_next = false;
            continue;
        }

        if let Some(quote) = open_quote {
            if c == '\\' {
                skip_next = true;
            } else if c == quote {
                open_quote = None;
            }
            continue;
        }

        match c {
            '\'' | '"' => open_quote = Some(c),
            '[' | '{' | '<' => brackets += 1,
            // Saturating so a stray closer (e.g. the `>` of `=>`) cannot underflow.
            ']' | '}' | '>' => brackets = brackets.saturating_sub(1),
            '(' => parens += 1,
            ')' => parens = parens.saturating_sub(1),
            ',' => {
                if brackets == 0 && parens == 0 {
                    return args[offset + 1..].trim();
                }
            }
            _ => {}
        }
    }

    ""
}
932
933/// Bug #2: Resolves variable version references like `CSV::VERSION` or `RAILS_VERSION`.
934///
935/// Scans the file content for constant definitions matching the variable name
936/// and returns the resolved string value.
937fn resolve_variable_version(var_name: &str, contexts: &[String]) -> Option<String> {
938    let var_name = var_name.trim();
939    if var_name.is_empty() {
940        return None;
941    }
942
943    for candidate in candidate_constant_names(var_name) {
944        let escaped = regex::escape(&candidate);
945        let pattern = format!(r#"(?m)^\s*{}\s*=\s*["']([^"']+)["']"#, escaped);
946        let Ok(re) = Regex::new(&pattern) else {
947            continue;
948        };
949
950        for context in contexts {
951            if let Some(caps) = re.captures(context) {
952                return caps.get(1).map(|m| m.as_str().to_string());
953            }
954        }
955    }
956
957    None
958}
959
960fn resolve_variable_array(var_name: &str, contexts: &[String]) -> Option<Vec<String>> {
961    let var_name = var_name.trim();
962    if var_name.is_empty() {
963        return None;
964    }
965
966    for candidate in candidate_constant_names(var_name) {
967        let escaped = regex::escape(&candidate);
968        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(\[[^\n]+\])"#, escaped);
969        let Ok(re) = Regex::new(&pattern) else {
970            continue;
971        };
972
973        for context in contexts {
974            if let Some(caps) = re.captures(context)
975                && let Some(raw) = caps.get(1)
976            {
977                let values = extract_ruby_array(raw.as_str());
978                if !values.is_empty() {
979                    return Some(values);
980                }
981            }
982        }
983    }
984
985    None
986}
987
/// Lists the constant names to search for when resolving `var_name`.
///
/// The name itself always comes first. For a namespaced reference such as
/// `CSV::VERSION` the final segment (`VERSION`) is added as a fallback.
/// An empty final segment (as in `Foo::`) is never added: callers interpolate
/// each candidate into a `NAME = "..."` pattern, and an empty name would match
/// any line that starts with `=`.
fn candidate_constant_names(var_name: &str) -> Vec<String> {
    let mut names = vec![var_name.to_string()];

    // `rsplit` reaches the final segment without walking the whole path.
    let last_segment = var_name
        .rsplit("::")
        .next()
        .filter(|last| !last.is_empty() && *last != var_name);
    names.extend(last_segment.map(str::to_string));

    names
}
997
998fn load_required_ruby_contexts(content: &str, base_dir: Option<&Path>) -> Vec<String> {
999    let mut contexts = vec![content.to_string()];
1000    let Some(base_dir) = base_dir else {
1001        return contexts;
1002    };
1003
1004    let require_re = match Regex::new(r#"(?m)^\s*require(?:_relative)?\s+["']([^"']+)["']"#) {
1005        Ok(re) => re,
1006        Err(_) => return contexts,
1007    };
1008
1009    for caps in require_re.captures_iter(content) {
1010        let Some(required) = caps.get(1).map(|m| m.as_str()) else {
1011            continue;
1012        };
1013        for candidate in candidate_require_paths(base_dir, required) {
1014            if let Ok(required_content) = fs::read_to_string(&candidate) {
1015                contexts.push(required_content);
1016                break;
1017            }
1018        }
1019    }
1020
1021    contexts
1022}
1023
/// Builds the file locations to try for a `require`d name.
///
/// Every `::` in `required` is replaced with `/`, and `.rb` is appended unless
/// the name already ends with it. The result is looked up first directly
/// under `base_dir`, then under `base_dir/lib`.
fn candidate_require_paths(base_dir: &Path, required: &str) -> Vec<PathBuf> {
    let mut filename = required.replace("::", "/");
    if !filename.ends_with(".rb") {
        filename.push_str(".rb");
    }

    let direct = base_dir.join(&filename);
    let under_lib = base_dir.join("lib").join(&filename);
    vec![direct, under_lib]
}
1037
/// Heuristic for "this text names a Ruby constant": true when `s` starts with
/// an ASCII uppercase letter or contains the `::` scope operator.
fn looks_like_constant_reference(s: &str) -> bool {
    let starts_uppercase = s.starts_with(|c: char| c.is_ascii_uppercase());
    starts_uppercase || s.contains("::")
}
1041
/// Test-only convenience wrapper: parses `.gemspec` text into `PackageData`
/// without a base directory, delegating to `parse_gemspec_with_context`.
#[cfg(test)]
fn parse_gemspec(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_gemspec_with_context(content, no_base_dir)
}
1047
1048fn parse_gemspec_with_context(content: &str, base_dir: Option<&Path>) -> PackageData {
1049    let contexts = load_required_ruby_contexts(content, base_dir);
1050
1051    // Regex for spec.name = "value" or s.name = "value"
1052    // The spec variable name varies: spec, s, gem, etc.
1053    let field_re = match Regex::new(
1054        r#"(?m)^\s*\w+\.(name|version|summary|description|homepage|license)\s*=\s*(.+)$"#,
1055    ) {
1056        Ok(r) => r,
1057        Err(e) => {
1058            warn!("Failed to compile gemspec field regex: {}", e);
1059            return default_package_data_with_datasource(DatasourceId::Gemspec);
1060        }
1061    };
1062
1063    let licenses_re = match Regex::new(r#"(?m)^\s*\w+\.licenses\s*=\s*(.+)$"#) {
1064        Ok(r) => r,
1065        Err(e) => {
1066            warn!("Failed to compile licenses regex: {}", e);
1067            return default_package_data_with_datasource(DatasourceId::Gemspec);
1068        }
1069    };
1070
1071    let authors_re = match Regex::new(r#"(?m)^\s*\w+\.(?:authors|author)\s*=\s*(.+)$"#) {
1072        Ok(r) => r,
1073        Err(e) => {
1074            warn!("Failed to compile authors regex: {}", e);
1075            return default_package_data_with_datasource(DatasourceId::Gemspec);
1076        }
1077    };
1078
1079    let email_re = match Regex::new(r#"(?m)^\s*\w+\.email\s*=\s*(.+)$"#) {
1080        Ok(r) => r,
1081        Err(e) => {
1082            warn!("Failed to compile email regex: {}", e);
1083            return default_package_data_with_datasource(DatasourceId::Gemspec);
1084        }
1085    };
1086
1087    let dependency_call_re = match Regex::new(
1088        r#"(?m)^\s*\w+\.(add_(?:development_|runtime_)?dependency)\s*\(?(.+?)\)?\s*$"#,
1089    ) {
1090        Ok(r) => r,
1091        Err(e) => {
1092            warn!("Failed to compile gemspec dependency regex: {}", e);
1093            return default_package_data_with_datasource(DatasourceId::Gemspec);
1094        }
1095    };
1096
1097    let mut name: Option<String> = None;
1098    let mut version: Option<String> = None;
1099    let mut summary: Option<String> = None;
1100    let mut description: Option<String> = None;
1101    let mut homepage: Option<String> = None;
1102    let mut license: Option<String> = None;
1103    let mut licenses: Vec<String> = Vec::new();
1104    let mut authors: Vec<String> = Vec::new();
1105    let mut emails: Vec<String> = Vec::new();
1106    let mut dependencies: Vec<Dependency> = Vec::new();
1107
1108    // Extract basic fields
1109    for caps in field_re.captures_iter(content) {
1110        let field_name = match caps.get(1) {
1111            Some(m) => m.as_str(),
1112            None => continue,
1113        };
1114        let raw_value = match caps.get(2) {
1115            Some(m) => m.as_str().trim(),
1116            None => continue,
1117        };
1118
1119        match field_name {
1120            "name" => {
1121                let cleaned = clean_gemspec_value(raw_value);
1122                name = if looks_like_constant_reference(&cleaned) {
1123                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1124                } else {
1125                    Some(cleaned)
1126                }
1127            }
1128            "version" => {
1129                let cleaned = clean_gemspec_value(raw_value);
1130                // Bug #2: Check if version is a variable reference
1131                if looks_like_constant_reference(&cleaned) {
1132                    version = resolve_variable_version(&cleaned, &contexts).or(Some(cleaned));
1133                } else {
1134                    version = Some(cleaned);
1135                }
1136            }
1137            "summary" => {
1138                let cleaned = clean_gemspec_value(raw_value);
1139                summary = if looks_like_constant_reference(&cleaned) {
1140                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1141                } else {
1142                    Some(cleaned)
1143                }
1144            }
1145            "description" => description = Some(clean_gemspec_value(raw_value)),
1146            "homepage" => {
1147                let cleaned = clean_gemspec_value(raw_value);
1148                homepage = if looks_like_constant_reference(&cleaned) {
1149                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1150                } else {
1151                    Some(cleaned)
1152                }
1153            }
1154            "license" => license = Some(clean_gemspec_value(raw_value)),
1155            _ => {}
1156        }
1157    }
1158
1159    // Extract licenses (plural)
1160    for caps in licenses_re.captures_iter(content) {
1161        if let Some(raw) = caps.get(1) {
1162            licenses = extract_ruby_array(raw.as_str());
1163        }
1164    }
1165
1166    // Extract authors
1167    for caps in authors_re.captures_iter(content) {
1168        if let Some(raw) = caps.get(1) {
1169            let raw_str = raw.as_str().trim();
1170            if raw_str.starts_with('[') {
1171                authors = extract_ruby_array(raw_str);
1172            } else if looks_like_constant_reference(raw_str) {
1173                authors = resolve_variable_array(raw_str, &contexts)
1174                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1175            } else {
1176                authors.push(clean_gemspec_value(raw_str));
1177            }
1178        }
1179    }
1180
1181    // Extract emails
1182    for caps in email_re.captures_iter(content) {
1183        if let Some(raw) = caps.get(1) {
1184            let raw_str = raw.as_str().trim();
1185            if raw_str.starts_with('[') {
1186                emails = extract_ruby_array(raw_str);
1187            } else if looks_like_constant_reference(raw_str) {
1188                emails = resolve_variable_array(raw_str, &contexts)
1189                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1190            } else {
1191                emails.push(clean_gemspec_value(raw_str));
1192            }
1193        }
1194    }
1195
1196    // Build parties from authors and emails
1197    let mut parties: Vec<Party> = Vec::new();
1198
1199    if authors.len() == 1 && emails.len() == 1 {
1200        let email_str = emails.first().map(String::as_str);
1201        let (parsed_email_name, parsed_email) = match email_str {
1202            Some(e) => split_name_email(e),
1203            None => (None, None),
1204        };
1205
1206        parties.push(Party {
1207            r#type: Some("person".to_string()),
1208            role: Some("author".to_string()),
1209            name: authors.first().cloned().or(parsed_email_name),
1210            email: parsed_email.or_else(|| {
1211                email_str
1212                    .filter(|e| e.contains('@') && !e.contains('<'))
1213                    .map(|e| e.to_string())
1214            }),
1215            url: None,
1216            organization: None,
1217            organization_url: None,
1218            timezone: None,
1219        });
1220    } else {
1221        for author_name in authors {
1222            parties.push(Party {
1223                r#type: Some("person".to_string()),
1224                role: Some("author".to_string()),
1225                name: Some(author_name),
1226                email: None,
1227                url: None,
1228                organization: None,
1229                organization_url: None,
1230                timezone: None,
1231            });
1232        }
1233
1234        for email_str in emails {
1235            let (parsed_email_name, parsed_email) = if email_str.contains('<') {
1236                split_name_email(&email_str)
1237            } else {
1238                (None, None)
1239            };
1240            parties.push(Party {
1241                r#type: Some("person".to_string()),
1242                role: Some("author".to_string()),
1243                name: parsed_email_name,
1244                email: parsed_email.or_else(|| email_str.contains('@').then_some(email_str)),
1245                url: None,
1246                organization: None,
1247                organization_url: None,
1248                timezone: None,
1249            });
1250        }
1251    }
1252
1253    for caps in dependency_call_re.captures_iter(content) {
1254        let method = match caps.get(1) {
1255            Some(m) => m.as_str(),
1256            None => continue,
1257        };
1258        let args = match caps.get(2) {
1259            Some(m) => m.as_str(),
1260            None => continue,
1261        };
1262
1263        let Some(dep_name) = extract_first_ruby_value(args) else {
1264            continue;
1265        };
1266        let version_parts = extract_all_ruby_values(after_first_argument(args));
1267        let extracted_requirement = if version_parts.is_empty() {
1268            None
1269        } else {
1270            Some(version_parts.join(", "))
1271        };
1272        let purl = create_gem_purl(&dep_name, None);
1273        let is_development = method == "add_development_dependency";
1274        let scope = if is_development {
1275            "development"
1276        } else {
1277            "runtime"
1278        };
1279
1280        dependencies.push(Dependency {
1281            purl,
1282            extracted_requirement,
1283            scope: Some(scope.to_string()),
1284            is_runtime: Some(!is_development),
1285            is_optional: Some(is_development),
1286            is_pinned: None,
1287            is_direct: Some(true),
1288            resolved_package: None,
1289            extra_data: None,
1290        });
1291    }
1292
1293    // Extract license statement only - detection happens in separate engine
1294    let extracted_license_statement = if !licenses.is_empty() {
1295        Some(licenses.join(" AND "))
1296    } else {
1297        license
1298    };
1299
1300    let declared_license_expression = None;
1301    let declared_license_expression_spdx = None;
1302
1303    // Prefer description over summary
1304    let final_description = description.or(summary);
1305
1306    // Build PURL
1307    let purl = name
1308        .as_deref()
1309        .map(|n| create_gem_purl(n, version.as_deref()))
1310        .unwrap_or(None);
1311
1312    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1313        if let Some(n) = name.as_deref() {
1314            get_rubygems_urls(n, version.as_deref(), None)
1315        } else {
1316            (None, None, None, None)
1317        };
1318
1319    PackageData {
1320        package_type: Some(PACKAGE_TYPE),
1321        name,
1322        version,
1323        primary_language: Some("Ruby".to_string()),
1324        description: final_description,
1325        homepage_url: homepage,
1326        download_url,
1327        declared_license_expression,
1328        declared_license_expression_spdx,
1329        extracted_license_statement,
1330        parties,
1331        dependencies,
1332        repository_homepage_url,
1333        repository_download_url,
1334        api_data_url,
1335        datasource_id: Some(DatasourceId::Gemspec),
1336        purl,
1337        ..default_package_data()
1338    }
1339}
1340
1341// =============================================================================
1342// .gem Archive Parser (Wave 3)
1343// =============================================================================
1344
/// Largest `.gem` archive, in bytes, that the archive parser will open.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
/// Upper bound, in bytes, applied to `metadata.gz`: both to its stored size
/// inside the archive and to its decompressed size.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
/// Highest accepted ratio of decompressed size to stored size for
/// `metadata.gz`; anything above is rejected as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
1348
1349/// Parser for .gem archive files.
1350///
1351/// Extracts metadata from Ruby .gem packages, which are tar archives
1352/// containing a gzip-compressed YAML metadata file (`metadata.gz`).
1353///
1354/// Includes safety checks against zip bombs and oversized archives.
1355pub struct GemArchiveParser;
1356
1357impl PackageParser for GemArchiveParser {
1358    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1359
1360    fn extract_packages(path: &Path) -> Vec<PackageData> {
1361        vec![match extract_gem_archive(path) {
1362            Ok(data) => data,
1363            Err(e) => {
1364                warn!("Failed to extract .gem archive at {:?}: {}", path, e);
1365                default_package_data_with_datasource(DatasourceId::GemArchive)
1366            }
1367        }]
1368    }
1369
1370    fn is_match(path: &Path) -> bool {
1371        path.extension()
1372            .and_then(|ext| ext.to_str())
1373            .is_some_and(|ext| ext == "gem")
1374    }
1375}
1376
1377fn extract_gem_archive(path: &Path) -> Result<PackageData, String> {
1378    let file_metadata =
1379        fs::metadata(path).map_err(|e| format!("Failed to read file metadata: {}", e))?;
1380    let archive_size = file_metadata.len();
1381
1382    if archive_size > MAX_ARCHIVE_SIZE {
1383        return Err(format!(
1384            "Archive too large: {} bytes (limit: {} bytes)",
1385            archive_size, MAX_ARCHIVE_SIZE
1386        ));
1387    }
1388
1389    let file = File::open(path).map_err(|e| format!("Failed to open archive: {}", e))?;
1390    let mut archive = Archive::new(file);
1391
1392    for entry_result in archive
1393        .entries()
1394        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1395    {
1396        let entry = entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1397        let entry_path = entry
1398            .path()
1399            .map_err(|e| format!("Failed to get entry path: {}", e))?;
1400
1401        if entry_path.to_str() == Some("metadata.gz") {
1402            let entry_size = entry.size();
1403            if entry_size > MAX_FILE_SIZE {
1404                return Err(format!(
1405                    "metadata.gz too large: {} bytes (limit: {} bytes)",
1406                    entry_size, MAX_FILE_SIZE
1407                ));
1408            }
1409
1410            let mut decoder = GzDecoder::new(entry);
1411            let mut content = String::new();
1412            decoder
1413                .read_to_string(&mut content)
1414                .map_err(|e| format!("Failed to decompress metadata.gz: {}", e))?;
1415
1416            let uncompressed_size = content.len() as u64;
1417            if entry_size > 0 {
1418                let ratio = uncompressed_size as f64 / entry_size as f64;
1419                if ratio > MAX_COMPRESSION_RATIO {
1420                    return Err(format!(
1421                        "Suspicious compression ratio: {:.2}:1 (limit: {:.0}:1)",
1422                        ratio, MAX_COMPRESSION_RATIO
1423                    ));
1424                }
1425            }
1426            if uncompressed_size > MAX_FILE_SIZE {
1427                return Err(format!(
1428                    "Decompressed metadata too large: {} bytes (limit: {} bytes)",
1429                    uncompressed_size, MAX_FILE_SIZE
1430                ));
1431            }
1432
1433            return parse_gem_metadata_yaml(&content, DatasourceId::GemArchive);
1434        }
1435    }
1436
1437    Err("metadata.gz not found in .gem archive".to_string())
1438}
1439
1440fn parse_gem_metadata_yaml(
1441    content: &str,
1442    datasource_id: DatasourceId,
1443) -> Result<PackageData, String> {
1444    // Ruby YAML tagged types need to be handled:
1445    // --- !ruby/object:Gem::Specification
1446    // We strip Ruby-specific YAML tags since serde_yaml can't handle them
1447    let cleaned = clean_ruby_yaml_tags(content);
1448
1449    let yaml: serde_yaml::Value =
1450        serde_yaml::from_str(&cleaned).map_err(|e| format!("Failed to parse YAML: {}", e))?;
1451
1452    let name = yaml_string(&yaml, "name");
1453    let version = yaml.get("version").and_then(|v| {
1454        // version can be a simple string or a mapping with a "version" key
1455        if v.is_string() {
1456            v.as_str().map(|s| s.to_string())
1457        } else {
1458            yaml_string(v, "version")
1459        }
1460    });
1461    let description = yaml_string(&yaml, "description").or_else(|| yaml_string(&yaml, "summary"));
1462    let homepage = yaml_string(&yaml, "homepage");
1463    let summary = yaml_string(&yaml, "summary");
1464
1465    // Licenses
1466    let licenses: Vec<String> = yaml
1467        .get("licenses")
1468        .and_then(|v| v.as_sequence())
1469        .map(|seq| {
1470            seq.iter()
1471                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1472                .collect()
1473        })
1474        .unwrap_or_default();
1475
1476    // Extract license statement only - detection happens in separate engine
1477    let extracted_license_statement = if !licenses.is_empty() {
1478        Some(licenses.join(" AND "))
1479    } else {
1480        None
1481    };
1482
1483    let license_expression = None;
1484    let license_expression_spdx = None;
1485
1486    // Authors
1487    let authors: Vec<String> = yaml
1488        .get("authors")
1489        .and_then(|v| v.as_sequence())
1490        .map(|seq| {
1491            seq.iter()
1492                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1493                .collect()
1494        })
1495        .unwrap_or_default();
1496
1497    let emails: Vec<String> = yaml
1498        .get("email")
1499        .map(|v| {
1500            if let Some(seq) = v.as_sequence() {
1501                seq.iter()
1502                    .filter_map(|item| item.as_str().map(|s| s.to_string()))
1503                    .collect()
1504            } else if let Some(s) = v.as_str() {
1505                vec![s.to_string()]
1506            } else {
1507                Vec::new()
1508            }
1509        })
1510        .unwrap_or_default();
1511
1512    // Build parties
1513    let mut parties: Vec<Party> = Vec::new();
1514    let max_len = authors.len().max(emails.len());
1515    for i in 0..max_len {
1516        let author_name = authors.get(i).map(|s| s.as_str());
1517        let email_str = emails.get(i).map(|s| s.as_str());
1518
1519        let (parsed_email_name, parsed_email) = match email_str {
1520            Some(e) if e.contains('<') => split_name_email(e),
1521            None => (None, None),
1522            _ => (None, None),
1523        };
1524
1525        let party_name = author_name.map(|s| s.to_string()).or(parsed_email_name);
1526
1527        parties.push(Party {
1528            r#type: Some("person".to_string()),
1529            role: Some("author".to_string()),
1530            name: party_name,
1531            email: parsed_email.or_else(|| {
1532                email_str
1533                    .filter(|e| e.contains('@') && !e.contains('<'))
1534                    .map(|e| e.to_string())
1535            }),
1536            url: None,
1537            organization: None,
1538            organization_url: None,
1539            timezone: None,
1540        });
1541    }
1542
1543    // Dependencies
1544    let dependencies = parse_gem_yaml_dependencies(&yaml);
1545
1546    let metadata = yaml.get("metadata");
1547
1548    let bug_tracking_url = metadata.and_then(|m| yaml_string(m, "bug_tracking_uri"));
1549
1550    let code_view_url = metadata.and_then(|m| yaml_string(m, "source_code_uri"));
1551
1552    let vcs_url = code_view_url
1553        .clone()
1554        .or_else(|| metadata.and_then(|m| yaml_string(m, "homepage_uri")));
1555
1556    let file_references = metadata
1557        .and_then(|m| m.get("files"))
1558        .and_then(|f| f.as_sequence())
1559        .map(|seq| {
1560            seq.iter()
1561                .filter_map(|v| v.as_str())
1562                .map(|s| crate::models::FileReference {
1563                    path: s.to_string(),
1564                    size: None,
1565                    sha1: None,
1566                    md5: None,
1567                    sha256: None,
1568                    sha512: None,
1569                    extra_data: None,
1570                })
1571                .collect::<Vec<_>>()
1572        })
1573        .unwrap_or_default();
1574
1575    let release_date = yaml_string(&yaml, "date").and_then(|d| {
1576        if d.len() >= 10 {
1577            Some(d[..10].to_string())
1578        } else {
1579            None
1580        }
1581    });
1582
1583    let purl = name
1584        .as_deref()
1585        .map(|n| create_gem_purl(n, version.as_deref()))
1586        .unwrap_or(None);
1587
1588    let platform = yaml_string(&yaml, "platform");
1589    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1590        if let Some(n) = name.as_deref() {
1591            get_rubygems_urls(n, version.as_deref(), platform.as_deref())
1592        } else {
1593            (None, None, None, None)
1594        };
1595
1596    let qualifiers = if let Some(ref p) = platform {
1597        if p != "ruby" {
1598            let mut q = HashMap::new();
1599            q.insert("platform".to_string(), p.clone());
1600            Some(q)
1601        } else {
1602            None
1603        }
1604    } else {
1605        None
1606    };
1607
1608    Ok(PackageData {
1609        package_type: Some(PACKAGE_TYPE),
1610        name,
1611        version,
1612        qualifiers,
1613        primary_language: Some("Ruby".to_string()),
1614        description: description.or(summary),
1615        release_date,
1616        homepage_url: homepage,
1617        download_url,
1618        bug_tracking_url,
1619        code_view_url,
1620        declared_license_expression: license_expression,
1621        declared_license_expression_spdx: license_expression_spdx,
1622        extracted_license_statement,
1623        file_references,
1624        parties,
1625        dependencies,
1626        repository_homepage_url,
1627        repository_download_url,
1628        api_data_url,
1629        datasource_id: Some(datasource_id),
1630        purl,
1631        vcs_url,
1632        ..default_package_data()
1633    })
1634}
1635
1636/// Strips Ruby-specific YAML tags that serde_yaml cannot handle.
1637fn clean_ruby_yaml_tags(content: &str) -> String {
1638    let tag_re = match Regex::new(r"!ruby/\S+") {
1639        Ok(r) => r,
1640        Err(_) => return content.to_string(),
1641    };
1642    tag_re.replace_all(content, "").to_string()
1643}
1644
1645fn yaml_string(yaml: &serde_yaml::Value, key: &str) -> Option<String> {
1646    yaml.get(key)
1647        .and_then(|v| v.as_str())
1648        .filter(|s| !s.is_empty())
1649        .map(|s| s.to_string())
1650}
1651
1652fn parse_gem_yaml_dependencies(yaml: &serde_yaml::Value) -> Vec<Dependency> {
1653    let mut dependencies = Vec::new();
1654
1655    let deps_seq = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
1656        Some(seq) => seq,
1657        None => return dependencies,
1658    };
1659
1660    for dep_value in deps_seq {
1661        let dep_name = match yaml_string(dep_value, "name") {
1662            Some(n) => n,
1663            None => continue,
1664        };
1665
1666        let dep_type = yaml_string(dep_value, "type");
1667        let is_development = dep_type.as_deref() == Some(":development");
1668
1669        // Extract version requirements from the nested structure
1670        let requirements = dep_value
1671            .get("requirement")
1672            .or_else(|| dep_value.get("version_requirements"))
1673            .and_then(|req| req.get("requirements"))
1674            .and_then(|reqs| reqs.as_sequence());
1675
1676        let extracted_requirement = requirements.map(|reqs| {
1677            let parts: Vec<String> = reqs
1678                .iter()
1679                .filter_map(|req| {
1680                    let seq = req.as_sequence()?;
1681                    if seq.len() >= 2 {
1682                        let op = seq[0].as_str().unwrap_or("");
1683                        let ver = seq[1].get("version").and_then(|v| v.as_str()).unwrap_or("");
1684                        if op == ">=" && ver == "0" {
1685                            // ">= 0" means "any version" - skip
1686                            None
1687                        } else if op.is_empty() || ver.is_empty() {
1688                            None
1689                        } else {
1690                            Some(format!("{} {}", op, ver))
1691                        }
1692                    } else {
1693                        None
1694                    }
1695                })
1696                .collect();
1697            parts.join(", ")
1698        });
1699
1700        let extracted_requirement = extracted_requirement
1701            .filter(|s| !s.is_empty())
1702            .or_else(|| Some(String::new()));
1703
1704        let (scope, is_runtime, is_optional) = if is_development {
1705            (Some("development".to_string()), false, true)
1706        } else {
1707            (Some("runtime".to_string()), true, false)
1708        };
1709
1710        let purl = create_gem_purl(&dep_name, None);
1711
1712        dependencies.push(Dependency {
1713            purl,
1714            extracted_requirement,
1715            scope,
1716            is_runtime: Some(is_runtime),
1717            is_optional: Some(is_optional),
1718            is_pinned: None,
1719            is_direct: Some(true),
1720            resolved_package: None,
1721            extra_data: None,
1722        });
1723    }
1724
1725    dependencies
1726}
1727
1728// =============================================================================
1729// Gem Metadata Extracted Parser (metadata.gz-extract files)
1730// =============================================================================
1731
1732pub struct GemMetadataExtractedParser;
1733
1734impl PackageParser for GemMetadataExtractedParser {
1735    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1736
1737    fn extract_packages(path: &Path) -> Vec<PackageData> {
1738        vec![match extract_gem_metadata_extracted(path) {
1739            Ok(data) => data,
1740            Err(e) => {
1741                warn!("Failed to extract gem metadata from {:?}: {}", path, e);
1742                default_package_data_with_datasource(DatasourceId::GemArchiveExtracted)
1743            }
1744        }]
1745    }
1746
1747    fn is_match(path: &Path) -> bool {
1748        path.to_str()
1749            .is_some_and(|p| p.contains("metadata.gz-extract"))
1750    }
1751}
1752
1753fn extract_gem_metadata_extracted(path: &Path) -> Result<PackageData, String> {
1754    let content = fs::read_to_string(path)
1755        .map_err(|e| format!("Failed to read metadata.gz-extract file: {}", e))?;
1756
1757    parse_gem_metadata_yaml(&content, DatasourceId::GemArchiveExtracted)
1758}
1759
// Register metadata for every Ruby parser in this module.
//
// Each invocation passes, in order: a human-readable description, the path
// glob patterns, the package type ("gem"), the primary language ("Ruby"), and
// an optional documentation URL.
// NOTE(review): the argument meanings above are inferred from the values
// passed here — confirm against the `register_parser!` definition.

// Gemfile manifests, including those found under a `data.gz-extract` directory.
crate::register_parser!(
    "Ruby Gemfile manifest",
    &["**/Gemfile", "**/data.gz-extract/Gemfile"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// Gemfile.lock lockfiles; the documentation URL is the same Gemfile man page
// used for the manifest registration.
crate::register_parser!(
    "Ruby Gemfile.lock lockfile",
    &["**/Gemfile.lock", "**/data.gz-extract/Gemfile.lock"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// .gemspec files anywhere, plus explicit patterns for `data.gz-extract` and
// `specifications` directories.
crate::register_parser!(
    "Ruby .gemspec manifest",
    &[
        "**/*.gemspec",
        "**/data.gz-extract/*.gemspec",
        "**/specifications/*.gemspec"
    ],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Packaged .gem archives.
crate::register_parser!(
    "Ruby .gem archive",
    &["**/*.gem"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Gem metadata files already extracted from an archive (`metadata.gz-extract`).
crate::register_parser!(
    "Ruby gem metadata (extracted)",
    &["**/metadata.gz-extract"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);
1804
#[cfg(test)]
mod tests {
    use super::parse_gemspec;

    /// A `%q{` literal with no closing brace should still yield its text
    /// with the opening delimiter removed.
    #[test]
    fn test_clean_gemspec_value_handles_unterminated_percent_q() {
        let raw = "%q{Arel is a SQL AST manager for Ruby. It";
        let cleaned = super::clean_gemspec_value(raw);

        assert_eq!(cleaned, "Arel is a SQL AST manager for Ruby. It");
    }

    /// Both `add_runtime_dependency` and `add_dependency` should produce
    /// dependencies with the "runtime" scope and their declared requirement.
    #[test]
    fn test_parse_gemspec_runtime_dependency_scope() {
        let content = r#"
Gem::Specification.new do |spec|
  spec.name = "demo"
  spec.version = "1.0.0"
  spec.add_runtime_dependency "rack", "~> 3.0"
  spec.add_dependency "thor", ">= 1.0"
end
"#;

        let package_data = parse_gemspec(content);
        let deps = &package_data.dependencies;
        assert_eq!(deps.len(), 2);

        // Requirements in declaration order: rack first, then thor.
        let expected_requirements = ["~> 3.0", ">= 1.0"];
        for (dep, requirement) in deps.iter().zip(expected_requirements) {
            assert_eq!(dep.scope, Some("runtime".to_string()));
            assert_eq!(dep.extracted_requirement, Some(requirement.to_string()));
        }
    }
}