Skip to main content

provenant/parsers/
ruby.rs

1//! Parser for Ruby/RubyGems package manifests.
2//!
3//! Extracts package metadata, dependencies, and platform information from
4//! Gemfile and Gemfile.lock files used by Ruby/Bundler projects.
5//!
6//! # Supported Formats
7//! - Gemfile (manifest with Ruby DSL)
8//! - Gemfile.lock (lockfile with state machine sections)
9//! - *.gemspec (gem specification files)
10//! - *.gem (gem archive packages)
11//! - metadata.gz-extract (pre-extracted gem metadata)
12//!
13//! # Key Features
14//! - State machine parsing for Gemfile.lock sections (GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH, DEPENDENCIES)
15//! - Regex-based Ruby DSL parsing for Gemfile
16//! - Dependency group handling (:development, :test, etc.)
17//! - Platform-specific gem support
18//! - Pessimistic version operator (~>) support
19//! - Bug Fix #1: Strip .freeze suffix from strings
20//! - Bug Fix #4: Correct dependency scope mapping (:runtime → None, :development → "development")
21//!
22//! # Implementation Notes
23//! - Uses regex for pattern matching (not full Ruby AST)
24//! - Graceful error handling: logs warnings and returns default on parse failure
25//! - PURL type: "gem"
26
27use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
28use crate::parser_warn as warn;
29use crate::parsers::utils::split_name_email;
30use flate2::read::GzDecoder;
31use packageurl::PackageUrl;
32use regex::Regex;
33use std::collections::HashMap;
34use std::fs::{self, File};
35use std::io::Read;
36use std::path::{Path, PathBuf};
37use tar::Archive;
38
39use super::PackageParser;
40use super::license_normalization::normalize_spdx_declared_license;
41
42const PACKAGE_TYPE: PackageType = PackageType::Gem;
43
44// =============================================================================
45// Bug Fix #1: Strip .freeze suffix from strings
46// =============================================================================
47
/// Removes trailing `.freeze` calls from a Ruby string expression.
///
/// Ruby's `.freeze` only marks a literal as immutable, so it carries no
/// information for gem names or versions. Every trailing occurrence is
/// removed (`"x".freeze.freeze` becomes `"x"`); input without the suffix is
/// returned unchanged.
///
/// # Examples
/// ```ignore
/// assert_eq!(strip_freeze_suffix("\"name\".freeze"), "\"name\"");
/// assert_eq!(strip_freeze_suffix("'1.0.0'.freeze"), "'1.0.0'");
/// ```
pub fn strip_freeze_suffix(s: &str) -> &str {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix(".freeze") {
        remaining = shorter;
    }
    remaining
}
61
/// A `do ... end` block that is open while a Gemfile is being scanned.
///
/// `parse_gemfile` keeps a stack of these so that a `gem` line can look up
/// the innermost enclosing group and source.
enum GemfileBlock {
    /// `group ... do` block; holds the symbol names from the opening line.
    Group(Vec<String>),
    /// `source "<value>" do` block; holds the quoted source value.
    Source(String),
}
66
67// =============================================================================
68// Gemfile Parser (Ruby DSL)
69// =============================================================================
70
71/// Ruby Gemfile parser for manifest files.
72///
73/// Parses Ruby DSL syntax to extract gem declarations, dependency groups,
74/// platform-specific gems, and version constraints.
75pub struct GemfileParser;
76
77impl PackageParser for GemfileParser {
78    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
79
80    fn extract_packages(path: &Path) -> Vec<PackageData> {
81        let datasource_id = gemfile_datasource_id(path);
82        let content = match fs::read_to_string(path) {
83            Ok(c) => c,
84            Err(e) => {
85                warn!("Failed to read Gemfile at {:?}: {}", path, e);
86                return vec![default_package_data_with_datasource(datasource_id)];
87            }
88        };
89
90        let mut package_data = parse_gemfile(&content);
91        package_data.datasource_id = Some(datasource_id);
92        vec![package_data]
93    }
94
95    fn is_match(path: &Path) -> bool {
96        path.file_name()
97            .and_then(|n| n.to_str())
98            .is_some_and(|name| name == "Gemfile")
99            || path
100                .to_str()
101                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile"))
102    }
103}
104
105/// Parses Gemfile content and extracts dependencies with groups.
106fn parse_gemfile(content: &str) -> PackageData {
107    let mut dependencies = Vec::new();
108    let mut block_stack = Vec::new();
109    let mut default_source = None;
110    let mut sources = Vec::new();
111
112    // Regex patterns for Gemfile parsing
113    // gem "name", "version", options...
114    let gem_regex = match Regex::new(
115        r#"^\s*gem\s+["']([^"']+)["'](?:\.freeze)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*(.+))?"#,
116    ) {
117        Ok(r) => r,
118        Err(e) => {
119            warn!("Failed to compile gem regex: {}", e);
120            return default_package_data_with_datasource(DatasourceId::Gemfile);
121        }
122    };
123
124    // group :name do ... end
125    let group_start_regex = match Regex::new(r"^\s*group\s+(.+?)\s+do\s*$") {
126        Ok(r) => r,
127        Err(e) => {
128            warn!("Failed to compile group regex: {}", e);
129            return default_package_data_with_datasource(DatasourceId::Gemfile);
130        }
131    };
132
133    let group_end_regex = match Regex::new(r"^\s*end\s*$") {
134        Ok(r) => r,
135        Err(e) => {
136            warn!("Failed to compile end regex: {}", e);
137            return default_package_data_with_datasource(DatasourceId::Gemfile);
138        }
139    };
140
141    let source_block_start_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s+do\s*$"#) {
142        Ok(r) => r,
143        Err(e) => {
144            warn!("Failed to compile source block regex: {}", e);
145            return default_package_data_with_datasource(DatasourceId::Gemfile);
146        }
147    };
148
149    let source_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s*$"#) {
150        Ok(r) => r,
151        Err(e) => {
152            warn!("Failed to compile source regex: {}", e);
153            return default_package_data_with_datasource(DatasourceId::Gemfile);
154        }
155    };
156
157    // Parse symbols like :development, :test
158    let symbol_regex = match Regex::new(r":(\w+)") {
159        Ok(r) => r,
160        Err(e) => {
161            warn!("Failed to compile symbol regex: {}", e);
162            return default_package_data_with_datasource(DatasourceId::Gemfile);
163        }
164    };
165
166    for line in content.lines() {
167        let trimmed = line.trim();
168
169        // Skip comments and empty lines
170        if trimmed.is_empty() || trimmed.starts_with('#') {
171            continue;
172        }
173
174        // Check for group start
175        if let Some(caps) = group_start_regex.captures(trimmed) {
176            let groups_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
177            let mut current_groups = Vec::new();
178            for cap in symbol_regex.captures_iter(groups_str) {
179                if let Some(group_name) = cap.get(1) {
180                    current_groups.push(group_name.as_str().to_string());
181                }
182            }
183            block_stack.push(GemfileBlock::Group(current_groups));
184            continue;
185        }
186
187        if let Some(caps) = source_block_start_regex.captures(trimmed) {
188            let source = caps
189                .get(1)
190                .map(|m| m.as_str().to_string())
191                .unwrap_or_default();
192            if !source.is_empty() {
193                push_unique_string(&mut sources, source.clone());
194                block_stack.push(GemfileBlock::Source(source));
195            }
196            continue;
197        }
198
199        if let Some(caps) = source_regex.captures(trimmed) {
200            if let Some(source) = caps.get(1).map(|m| m.as_str().to_string()) {
201                push_unique_string(&mut sources, source.clone());
202                default_source = Some(source);
203            }
204            continue;
205        }
206
207        // Check for group end
208        if group_end_regex.is_match(trimmed) {
209            block_stack.pop();
210            continue;
211        }
212
213        // Parse gem declaration
214        if let Some(caps) = gem_regex.captures(trimmed) {
215            let name = strip_freeze_suffix(caps.get(1).map(|m| m.as_str()).unwrap_or(""));
216            if name.is_empty() {
217                continue;
218            }
219
220            // Collect version constraints
221            let mut version_parts = Vec::new();
222            if let Some(v) = caps.get(2) {
223                version_parts.push(strip_freeze_suffix(v.as_str()).to_string());
224            }
225            if let Some(v) = caps.get(3) {
226                let v_str = strip_freeze_suffix(v.as_str());
227                // Check if it looks like a version constraint
228                if looks_like_version_constraint(v_str) {
229                    version_parts.push(v_str.to_string());
230                }
231            }
232
233            let extracted_requirement = if version_parts.is_empty() {
234                None
235            } else {
236                Some(version_parts.join(", "))
237            };
238
239            let current_groups = current_group_names(&block_stack);
240
241            // Determine scope based on current group
242            // Bug Fix #4: :runtime → None, :development → "development"
243            let (scope, is_runtime, is_optional) = if current_groups.is_empty() {
244                // No group = runtime dependency
245                (None, true, false)
246            } else if current_groups.iter().any(|g| g == "development") {
247                (Some("development".to_string()), false, true)
248            } else if current_groups.iter().any(|g| g == "test") {
249                (Some("test".to_string()), false, true)
250            } else {
251                // Other groups (e.g., :production)
252                let group = current_groups.first().cloned();
253                (group, true, false)
254            };
255
256            // Create PURL
257            let purl = create_gem_purl(name, None);
258            let inherited_source = current_source(&block_stack, default_source.as_deref());
259            let extra_data = build_gemfile_dependency_extra_data(
260                caps.get(4).map(|m| m.as_str()),
261                inherited_source.as_deref(),
262            );
263
264            dependencies.push(Dependency {
265                purl,
266                extracted_requirement,
267                scope,
268                is_runtime: Some(is_runtime),
269                is_optional: Some(is_optional),
270                is_pinned: None,
271                is_direct: Some(true),
272                resolved_package: None,
273                extra_data,
274            });
275        }
276    }
277
278    let extra_data = if sources.is_empty() {
279        None
280    } else {
281        Some(HashMap::from([(
282            "sources".to_string(),
283            serde_json::Value::Array(sources.into_iter().map(serde_json::Value::String).collect()),
284        )]))
285    };
286
287    PackageData {
288        package_type: Some(PACKAGE_TYPE),
289        primary_language: Some("Ruby".to_string()),
290        dependencies,
291        extra_data,
292        datasource_id: Some(DatasourceId::Gemfile),
293        ..default_package_data()
294    }
295}
296
297fn current_group_names(block_stack: &[GemfileBlock]) -> Vec<String> {
298    block_stack
299        .iter()
300        .rev()
301        .find_map(|block| match block {
302            GemfileBlock::Group(groups) => Some(groups.clone()),
303            GemfileBlock::Source(_) => None,
304        })
305        .unwrap_or_default()
306}
307
308fn current_source(block_stack: &[GemfileBlock], default_source: Option<&str>) -> Option<String> {
309    block_stack
310        .iter()
311        .rev()
312        .find_map(|block| match block {
313            GemfileBlock::Source(source) => Some(source.clone()),
314            GemfileBlock::Group(_) => None,
315        })
316        .or_else(|| default_source.map(str::to_string))
317}
318
/// Appends `value` to `values` unless an equal string is already present.
fn push_unique_string(values: &mut Vec<String>, value: String) {
    let already_present = values.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    values.push(value);
}
324
325fn build_gemfile_dependency_extra_data(
326    options: Option<&str>,
327    inherited_source: Option<&str>,
328) -> Option<HashMap<String, serde_json::Value>> {
329    let mut extra = HashMap::new();
330    let options = options.unwrap_or("");
331
332    if let Some(git) = extract_gemfile_quoted_option(options, "git") {
333        extra.insert(
334            "source_type".to_string(),
335            serde_json::Value::String("GIT".to_string()),
336        );
337        extra.insert("git".to_string(), serde_json::Value::String(git.clone()));
338        extra.insert("remote".to_string(), serde_json::Value::String(git));
339    }
340
341    if let Some(path) = extract_gemfile_quoted_option(options, "path") {
342        extra.insert(
343            "source_type".to_string(),
344            serde_json::Value::String("PATH".to_string()),
345        );
346        extra.insert("path".to_string(), serde_json::Value::String(path));
347    }
348
349    for key in ["branch", "ref", "tag"] {
350        if let Some(value) = extract_gemfile_quoted_option(options, key) {
351            extra.insert(key.to_string(), serde_json::Value::String(value));
352        }
353    }
354
355    let direct_source = extract_gemfile_quoted_option(options, "source");
356    if let Some(source) = direct_source {
357        extra.insert("source".to_string(), serde_json::Value::String(source));
358    } else if !extra.contains_key("source_type")
359        && let Some(source) = inherited_source
360    {
361        extra.insert(
362            "source".to_string(),
363            serde_json::Value::String(source.to_string()),
364        );
365    }
366
367    (!extra.is_empty()).then_some(extra)
368}
369
370fn extract_gemfile_quoted_option(options: &str, key: &str) -> Option<String> {
371    if options.is_empty() {
372        return None;
373    }
374
375    let pattern = format!(r#"(?:^|,\s*){}\s*:\s*["']([^"']+)["']"#, regex::escape(key));
376    Regex::new(&pattern)
377        .ok()
378        .and_then(|regex| regex.captures(options))
379        .and_then(|captures| captures.get(1).map(|m| m.as_str().to_string()))
380}
381
/// Checks if a string looks like a version constraint.
///
/// Only the first character is inspected: it must be one of the operator
/// characters `~`, `>`, `<`, `=`, `!` or an ASCII digit. An empty string is
/// not a constraint.
fn looks_like_version_constraint(s: &str) -> bool {
    match s.chars().next() {
        Some('~' | '>' | '<' | '=' | '!') => true,
        Some(first) => first.is_ascii_digit(),
        None => false,
    }
}
391
392// =============================================================================
393// Gemfile.lock Parser (State Machine)
394// =============================================================================
395
396/// Ruby Gemfile.lock parser for lockfiles.
397///
398/// Uses a state machine to parse sections: GEM, GIT, PATH, SVN,
399/// PLATFORMS, BUNDLED WITH, DEPENDENCIES.
400pub struct GemfileLockParser;
401
402impl PackageParser for GemfileLockParser {
403    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
404
405    fn extract_packages(path: &Path) -> Vec<PackageData> {
406        let datasource_id = gemfile_lock_datasource_id(path);
407        let content = match fs::read_to_string(path) {
408            Ok(c) => c,
409            Err(e) => {
410                warn!("Failed to read Gemfile.lock at {:?}: {}", path, e);
411                return vec![default_package_data_with_datasource(datasource_id)];
412            }
413        };
414
415        let mut package_data = parse_gemfile_lock(&content);
416        package_data.datasource_id = Some(datasource_id);
417        vec![package_data]
418    }
419
420    fn is_match(path: &Path) -> bool {
421        path.file_name()
422            .and_then(|n| n.to_str())
423            .is_some_and(|name| name == "Gemfile.lock")
424            || path
425                .to_str()
426                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile.lock"))
427    }
428}
429
/// Parse state for Gemfile.lock state machine.
///
/// The state changes when a section header (`GEM`, `PLATFORMS`, ...) or a
/// `specs:` sub-header is read, and decides how following lines are handled.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// No section header seen yet; lines are ignored.
    None,
    /// Inside a `GEM` section, before `specs:`; `key: value` options are read.
    Gem,
    /// Inside a `GIT` section, before `specs:`; `key: value` options are read.
    Git,
    /// Inside a `PATH` section, before `specs:`; `key: value` options are read.
    Path,
    /// Inside a `SVN` section, before `specs:`; `key: value` options are read.
    Svn,
    /// After `specs:` within a GEM/GIT/PATH/SVN section; gem entries are read.
    Specs,
    /// Inside `PLATFORMS`; each line is recorded as a platform.
    Platforms,
    /// Inside `BUNDLED WITH`; the Bundler version is read.
    BundledWith,
    /// Inside `DEPENDENCIES`; dependency entries are read.
    Dependencies,
}
443
/// Parsed gem information from Gemfile.lock.
///
/// All fields are actively used:
/// - `gem_type`, `remote`, `revision`, `ref_field`, `branch`, `tag`: Stored in extra_data for GIT/PATH/SVN sources
/// - `name`, `version`, `platform`, `pinned`: Used for dependency PURL and metadata generation
/// - `requirements`: Stored as extracted_requirement for version constraints
#[derive(Debug, Clone, Default)]
struct GemInfo {
    /// Gem name as written in the lockfile.
    name: String,
    /// Version from the `specs:` entry (the part before the first `-`);
    /// `None` for gems only seen under DEPENDENCIES.
    version: Option<String>,
    /// Platform from the `specs:` entry (the part after the first `-`).
    platform: Option<String>,
    /// Section the gem was found in: "GEM", "GIT", "PATH" or "SVN".
    gem_type: String,
    /// `remote:` option of the enclosing section.
    remote: Option<String>,
    /// `revision:` option of the enclosing section.
    revision: Option<String>,
    /// `ref:` option of the enclosing section.
    ref_field: Option<String>,
    /// `branch:` option of the enclosing section.
    branch: Option<String>,
    /// `tag:` option of the enclosing section.
    tag: Option<String>,
    /// Whether the DEPENDENCIES entry ended with `!`.
    pinned: bool,
    /// Version constraints listed for the gem under DEPENDENCIES.
    requirements: Vec<String>,
}
464
465/// Parses Gemfile.lock content using a state machine.
466fn parse_gemfile_lock(content: &str) -> PackageData {
467    let mut state = ParseState::None;
468    let mut dependencies = Vec::new();
469    let mut gems: HashMap<String, GemInfo> = HashMap::new();
470    let mut platforms: Vec<String> = Vec::new();
471    let mut bundler_version: Option<String> = None;
472    let mut current_gem_type = String::new();
473    let mut current_remote: Option<String> = None;
474    let mut current_options: HashMap<String, String> = HashMap::new();
475
476    // DEPS pattern: 2 spaces at line start
477    let deps_regex = match Regex::new(r"^ {2}([^ \)\(,!:]+)(?: \(([^)]+)\))?(!)?$") {
478        Ok(r) => r,
479        Err(e) => {
480            warn!("Failed to compile deps regex: {}", e);
481            return default_package_data_with_datasource(DatasourceId::GemfileLock);
482        }
483    };
484
485    // SPEC_DEPS pattern: 4 spaces at line start
486    let spec_deps_regex = match Regex::new(r"^ {4}([^ \)\(,!:]+)(?: \(([^)]+)\))?$") {
487        Ok(r) => r,
488        Err(e) => {
489            warn!("Failed to compile spec_deps regex: {}", e);
490            return default_package_data_with_datasource(DatasourceId::GemfileLock);
491        }
492    };
493
494    // OPTIONS pattern: key: value
495    let options_regex = match Regex::new(r"^ {2}([a-z]+): (.+)$") {
496        Ok(r) => r,
497        Err(e) => {
498            warn!("Failed to compile options regex: {}", e);
499            return default_package_data_with_datasource(DatasourceId::GemfileLock);
500        }
501    };
502
503    // VERSION pattern for BUNDLED WITH
504    let version_regex = match Regex::new(r"^\s+(\d+(?:\.\d+)+)\s*$") {
505        Ok(r) => r,
506        Err(e) => {
507            warn!("Failed to compile version regex: {}", e);
508            return default_package_data_with_datasource(DatasourceId::GemfileLock);
509        }
510    };
511
512    for line in content.lines() {
513        let trimmed = line.trim_end();
514
515        // Empty line resets state
516        if trimmed.is_empty() {
517            current_options.clear();
518            continue;
519        }
520
521        // Section headers (no leading whitespace) and sub-section headers
522        match trimmed {
523            "GEM" => {
524                state = ParseState::Gem;
525                current_gem_type = "GEM".to_string();
526                current_remote = None;
527                current_options.clear();
528                continue;
529            }
530            "GIT" => {
531                state = ParseState::Git;
532                current_gem_type = "GIT".to_string();
533                current_remote = None;
534                current_options.clear();
535                continue;
536            }
537            "PATH" => {
538                state = ParseState::Path;
539                current_gem_type = "PATH".to_string();
540                current_remote = None;
541                current_options.clear();
542                continue;
543            }
544            "SVN" => {
545                state = ParseState::Svn;
546                current_gem_type = "SVN".to_string();
547                current_remote = None;
548                current_options.clear();
549                continue;
550            }
551            "PLATFORMS" => {
552                state = ParseState::Platforms;
553                continue;
554            }
555            "BUNDLED WITH" => {
556                state = ParseState::BundledWith;
557                continue;
558            }
559            "DEPENDENCIES" => {
560                state = ParseState::Dependencies;
561                continue;
562            }
563            _ => {}
564        }
565
566        // Check for "  specs:" sub-section header (2-space indent) within
567        // GEM/GIT/PATH/SVN sections. This must be checked separately because
568        // the leading whitespace is preserved by trim_end().
569        if trimmed.trim() == "specs:" {
570            state = match state {
571                ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
572                    ParseState::Specs
573                }
574                _ => state,
575            };
576            continue;
577        }
578
579        // Process based on current state
580        match state {
581            ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
582                // Parse options (remote:, revision:, ref:, branch:, tag:)
583                if let Some(caps) = options_regex.captures(line) {
584                    let key = caps.get(1).map(|m| m.as_str()).unwrap_or("");
585                    let value = caps.get(2).map(|m| m.as_str()).unwrap_or("");
586                    current_options.insert(key.to_string(), value.to_string());
587                    if key == "remote" {
588                        current_remote = Some(value.to_string());
589                    }
590                }
591            }
592            ParseState::Specs => {
593                // Parse gem specs (4 spaces indent)
594                if let Some(caps) = spec_deps_regex.captures(line) {
595                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
596                    let version_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
597
598                    // Parse version and platform
599                    let (version, platform) = parse_version_platform(version_str);
600
601                    if !name.is_empty() {
602                        let gem_info = GemInfo {
603                            name: name.clone(),
604                            version,
605                            platform,
606                            gem_type: current_gem_type.clone(),
607                            remote: current_remote.clone(),
608                            revision: current_options.get("revision").cloned(),
609                            ref_field: current_options.get("ref").cloned(),
610                            branch: current_options.get("branch").cloned(),
611                            tag: current_options.get("tag").cloned(),
612                            pinned: false,
613                            requirements: Vec::new(),
614                        };
615                        gems.insert(name, gem_info);
616                    }
617                }
618            }
619            ParseState::Platforms => {
620                // Parse platform entries (2 spaces indent)
621                let platform = trimmed.trim();
622                if !platform.is_empty() {
623                    platforms.push(platform.to_string());
624                }
625            }
626            ParseState::BundledWith => {
627                // Parse bundler version
628                if let Some(caps) = version_regex.captures(line) {
629                    bundler_version = caps.get(1).map(|m| m.as_str().to_string());
630                }
631            }
632            ParseState::Dependencies => {
633                // Parse direct dependencies (2 spaces indent)
634                if let Some(caps) = deps_regex.captures(line) {
635                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
636                    let version_constraint = caps.get(2).map(|m| m.as_str().to_string());
637                    let pinned = caps.get(3).is_some();
638
639                    if !name.is_empty() {
640                        // Update gem info if exists, or create new
641                        if let Some(gem) = gems.get_mut(&name) {
642                            gem.pinned = pinned;
643                            if let Some(vc) = &version_constraint {
644                                gem.requirements.push(vc.clone());
645                            }
646                        } else {
647                            let gem_info = GemInfo {
648                                name: name.clone(),
649                                version: None,
650                                platform: None,
651                                gem_type: "GEM".to_string(),
652                                remote: None,
653                                revision: None,
654                                ref_field: None,
655                                branch: None,
656                                tag: None,
657                                pinned,
658                                requirements: version_constraint.into_iter().collect(),
659                            };
660                            gems.insert(name, gem_info);
661                        }
662                    }
663                }
664            }
665            ParseState::None => {}
666        }
667    }
668
669    let primary_gem = gems.values().find(|gem| gem.gem_type == "PATH").cloned();
670
671    let (
672        package_name,
673        package_version,
674        repository_homepage_url,
675        repository_download_url,
676        api_data_url,
677        download_url,
678    ) = if let Some(ref pg) = primary_gem {
679        let urls = get_rubygems_urls(&pg.name, pg.version.as_deref(), pg.platform.as_deref());
680        (
681            Some(pg.name.clone()),
682            pg.version.clone(),
683            urls.0,
684            urls.1,
685            urls.2,
686            urls.3,
687        )
688    } else {
689        (None, None, None, None, None, None)
690    };
691
692    for (_, gem) in gems {
693        if let Some(ref pg) = primary_gem
694            && gem.name == pg.name
695        {
696            continue;
697        }
698
699        let version_for_purl = gem.version.as_deref();
700        let purl = create_gem_purl(&gem.name, version_for_purl);
701
702        let extracted_requirement = if !gem.requirements.is_empty() {
703            Some(gem.requirements.join(", "))
704        } else {
705            gem.version.clone()
706        };
707
708        let extra_data = build_gem_source_extra_data(&gem);
709
710        dependencies.push(Dependency {
711            purl,
712            extracted_requirement,
713            scope: Some("dependencies".to_string()),
714            is_runtime: Some(true),
715            is_optional: Some(false),
716            is_pinned: Some(gem.pinned),
717            is_direct: Some(true),
718            resolved_package: None,
719            extra_data,
720        });
721    }
722
723    dependencies.sort_by(|left, right| {
724        left.purl
725            .as_deref()
726            .cmp(&right.purl.as_deref())
727            .then_with(|| {
728                left.extracted_requirement
729                    .as_deref()
730                    .cmp(&right.extracted_requirement.as_deref())
731            })
732    });
733
734    // Build extra_data
735    let mut extra_data = HashMap::new();
736    if !platforms.is_empty() {
737        extra_data.insert(
738            "platforms".to_string(),
739            serde_json::Value::Array(
740                platforms
741                    .into_iter()
742                    .map(serde_json::Value::String)
743                    .collect(),
744            ),
745        );
746    }
747    if let Some(bv) = bundler_version {
748        extra_data.insert("bundler_version".to_string(), serde_json::Value::String(bv));
749    }
750
751    let purl = package_name
752        .as_deref()
753        .map(|n| create_gem_purl(n, package_version.as_deref()))
754        .unwrap_or(None);
755
756    PackageData {
757        package_type: Some(PACKAGE_TYPE),
758        name: package_name,
759        version: package_version,
760        primary_language: Some("Ruby".to_string()),
761        download_url,
762        dependencies,
763        repository_homepage_url,
764        repository_download_url,
765        api_data_url,
766        extra_data: if extra_data.is_empty() {
767            None
768        } else {
769            Some(extra_data)
770        },
771        datasource_id: Some(DatasourceId::GemfileLock),
772        purl,
773        ..default_package_data()
774    }
775}
776
777fn build_gem_source_extra_data(gem: &GemInfo) -> Option<HashMap<String, serde_json::Value>> {
778    if gem.gem_type != "GIT" && gem.gem_type != "PATH" && gem.gem_type != "SVN" {
779        return None;
780    }
781
782    let mut extra = HashMap::new();
783    extra.insert(
784        "source_type".to_string(),
785        serde_json::Value::String(gem.gem_type.clone()),
786    );
787
788    if let Some(ref remote) = gem.remote {
789        extra.insert(
790            "remote".to_string(),
791            serde_json::Value::String(remote.clone()),
792        );
793    }
794    if let Some(ref revision) = gem.revision {
795        extra.insert(
796            "revision".to_string(),
797            serde_json::Value::String(revision.clone()),
798        );
799    }
800    if let Some(ref ref_field) = gem.ref_field {
801        extra.insert(
802            "ref".to_string(),
803            serde_json::Value::String(ref_field.clone()),
804        );
805    }
806    if let Some(ref branch) = gem.branch {
807        extra.insert(
808            "branch".to_string(),
809            serde_json::Value::String(branch.clone()),
810        );
811    }
812    if let Some(ref tag) = gem.tag {
813        extra.insert("tag".to_string(), serde_json::Value::String(tag.clone()));
814    }
815
816    Some(extra)
817}
818
/// Splits a lockfile version string into version and platform.
///
/// The split happens at the first `-`; everything after it is the platform.
/// Examples: "2.6.3" -> (Some("2.6.3"), None),
/// "2.6.3-java" -> (Some("2.6.3"), Some("java")), "" -> (None, None).
fn parse_version_platform(s: &str) -> (Option<String>, Option<String>) {
    if s.is_empty() {
        return (None, None);
    }

    match s.split_once('-') {
        Some((version, platform)) => (Some(version.to_owned()), Some(platform.to_owned())),
        None => (Some(s.to_owned()), None),
    }
}
833
834/// Creates a gem PURL.
835fn create_gem_purl(name: &str, version: Option<&str>) -> Option<String> {
836    let mut purl = match PackageUrl::new(PACKAGE_TYPE.as_str(), name) {
837        Ok(p) => p,
838        Err(e) => {
839            warn!("Failed to create PURL for gem '{}': {}", name, e);
840            return None;
841        }
842    };
843
844    if let Some(v) = version
845        && let Err(e) = purl.with_version(v)
846    {
847        warn!("Failed to set version '{}' for gem '{}': {}", v, name, e);
848    }
849
850    Some(purl.to_string())
851}
852
/// Builds the rubygems.org page URL for a gem.
///
/// With a version the URL points at that version's page (the version is
/// trimmed of surrounding whitespace and `/`); without one it points at the
/// gem's main page. Returns `None` for an empty `name`.
fn rubygems_homepage_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(v) => {
            let v = v.trim().trim_matches('/');
            format!("https://rubygems.org/gems/{}/versions/{}", name, v)
        }
        None => format!("https://rubygems.org/gems/{}", name),
    };
    Some(url)
}
865
/// Builds the rubygems.org download URL of a `.gem` file.
///
/// The file name is `{name}-{version}.gem`, or
/// `{name}-{version}-{platform}.gem` when a platform other than `"ruby"` is
/// given.
///
/// `name` and `version` are trimmed of whitespace and of leading/trailing
/// `/` before use. Returns `None` when the version is missing, or when the
/// name or version is empty after trimming. An empty platform is treated the
/// same as no platform.
fn rubygems_download_url(
    name: &str,
    version: Option<&str>,
    platform: Option<&str>,
) -> Option<String> {
    let name = name.trim().trim_matches('/');
    let version = version?.trim().trim_matches('/');
    // Validate after trimming so blank inputs cannot yield "-1.0.gem" or "name-.gem".
    if name.is_empty() || version.is_empty() {
        return None;
    }

    // The default "ruby" platform is not part of the file name.
    let platform = platform.filter(|p| !p.is_empty() && *p != "ruby");
    let file_stem = match platform {
        Some(p) => format!("{}-{}-{}", name, version, p),
        None => format!("{}-{}", name, version),
    };

    Some(format!("https://rubygems.org/downloads/{}.gem", file_stem))
}
893
/// Builds the rubygems.org JSON API URL for a gem.
///
/// * Returns `None` when `name` is empty.
/// * With a non-empty version, returns the v2 per-version endpoint
///   `https://rubygems.org/api/v2/rubygems/{name}/versions/{version}.json`.
/// * Otherwise returns the v1 endpoint
///   `https://rubygems.org/api/v1/versions/{name}.json`.
///
/// An empty version string is treated as absent so the URL never contains
/// `/versions/.json`.
fn rubygems_api_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    Some(match version.filter(|v| !v.is_empty()) {
        Some(v) => format!(
            "https://rubygems.org/api/v2/rubygems/{}/versions/{}.json",
            name, v
        ),
        None => format!("https://rubygems.org/api/v1/versions/{}.json", name),
    })
}
911
912fn get_rubygems_urls(
913    name: &str,
914    version: Option<&str>,
915    platform: Option<&str>,
916) -> (
917    Option<String>,
918    Option<String>,
919    Option<String>,
920    Option<String>,
921) {
922    let repository_homepage_url = rubygems_homepage_url(name, version);
923    let repository_download_url = rubygems_download_url(name, version, platform);
924    let api_data_url = rubygems_api_url(name, version);
925    let download_url = repository_download_url.clone();
926
927    (
928        repository_homepage_url,
929        repository_download_url,
930        api_data_url,
931        download_url,
932    )
933}
934
935/// Returns a default PackageData with gem-specific settings.
936fn default_package_data() -> PackageData {
937    PackageData {
938        package_type: Some(PACKAGE_TYPE),
939        primary_language: Some("Ruby".to_string()),
940        ..Default::default()
941    }
942}
943
944fn default_package_data_with_datasource(datasource_id: DatasourceId) -> PackageData {
945    PackageData {
946        datasource_id: Some(datasource_id),
947        ..default_package_data()
948    }
949}
950
951// =============================================================================
952// Gemspec Parser (Ruby DSL)
953// =============================================================================
954
955/// Ruby .gemspec file parser.
956///
957/// Parses `Gem::Specification.new` blocks using regex-based extraction.
958/// Handles frozen strings (Bug #1), variable version resolution (Bug #2),
959/// and RFC 5322 email parsing (Bug #6).
960pub struct GemspecParser;
961
962impl PackageParser for GemspecParser {
963    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
964
965    fn extract_packages(path: &Path) -> Vec<PackageData> {
966        let datasource_id = gemspec_datasource_id(path);
967        let content = match fs::read_to_string(path) {
968            Ok(c) => c,
969            Err(e) => {
970                warn!("Failed to read .gemspec at {:?}: {}", path, e);
971                return vec![default_package_data_with_datasource(datasource_id)];
972            }
973        };
974
975        let mut package_data = parse_gemspec_with_context(&content, path.parent());
976        package_data.datasource_id = Some(datasource_id);
977        vec![package_data]
978    }
979
980    fn is_match(path: &Path) -> bool {
981        path.extension()
982            .and_then(|ext| ext.to_str())
983            .is_some_and(|ext| ext == "gemspec")
984    }
985}
986
/// Renders `path` as a (lossy) string with every backslash replaced by `/`,
/// so path-segment checks work for both separator styles.
fn normalized_ruby_path(path: &Path) -> String {
    let rendered = path.to_string_lossy();
    rendered.replace('\\', "/")
}
990
991fn gemfile_datasource_id(path: &Path) -> DatasourceId {
992    if normalized_ruby_path(path).contains("/data.gz-extract/") {
993        DatasourceId::GemfileExtracted
994    } else {
995        DatasourceId::Gemfile
996    }
997}
998
999fn gemfile_lock_datasource_id(path: &Path) -> DatasourceId {
1000    if normalized_ruby_path(path).contains("/data.gz-extract/") {
1001        DatasourceId::GemfileLockExtracted
1002    } else {
1003        DatasourceId::GemfileLock
1004    }
1005}
1006
1007fn gemspec_datasource_id(path: &Path) -> DatasourceId {
1008    let normalized = normalized_ruby_path(path);
1009    if normalized.contains("/data.gz-extract/") {
1010        DatasourceId::GemspecExtracted
1011    } else if normalized.contains("/specifications/") {
1012        DatasourceId::GemGemspecInstalledSpecifications
1013    } else {
1014        DatasourceId::Gemspec
1015    }
1016}
1017
1018/// Cleans a value extracted from gemspec by stripping quotes, .freeze, %q{}, and brackets.
1019fn clean_gemspec_value(s: &str) -> String {
1020    let s = strip_freeze_suffix(s).trim();
1021
1022    let s = if let Some(pos) = s.find(" #") {
1023        s[..pos].trim()
1024    } else {
1025        s
1026    };
1027
1028    let s = if let Some(stripped) = s.strip_prefix("%q{") {
1029        stripped.strip_suffix('}').unwrap_or(stripped)
1030    } else if let Some(stripped) = s.strip_prefix("%q<") {
1031        stripped.strip_suffix('>').unwrap_or(stripped)
1032    } else if let Some(stripped) = s.strip_prefix("%q[") {
1033        stripped.strip_suffix(']').unwrap_or(stripped)
1034    } else if let Some(stripped) = s.strip_prefix("%q(") {
1035        stripped.strip_suffix(')').unwrap_or(stripped)
1036    } else {
1037        s
1038    };
1039
1040    let s = s
1041        .trim_start_matches('"')
1042        .trim_end_matches('"')
1043        .trim_start_matches('\'')
1044        .trim_end_matches('\'');
1045    let s = strip_freeze_suffix(s).trim();
1046    s.to_string()
1047}
1048
1049/// Extracts items from a Ruby array literal like `["a", "b", "c"]`.
1050fn extract_ruby_array(s: &str) -> Vec<String> {
1051    let s = strip_freeze_suffix(s.trim());
1052    let s = s.trim_start_matches('[').trim_end_matches(']');
1053    let item_re = match Regex::new(r#"["']([^"']*?)["'](?:\.freeze)?"#) {
1054        Ok(r) => r,
1055        Err(_) => return Vec::new(),
1056    };
1057    item_re
1058        .captures_iter(s)
1059        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
1060        .collect()
1061}
1062
1063fn extract_all_ruby_values(s: &str) -> Vec<String> {
1064    let value_re = match Regex::new(r#"%q[\{<\[(]([^\}>\])]+)[\}>\])]|["']([^"']+)["']"#) {
1065        Ok(r) => r,
1066        Err(_) => return Vec::new(),
1067    };
1068
1069    value_re
1070        .captures_iter(s)
1071        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)))
1072        .map(|m| clean_gemspec_value(m.as_str()))
1073        .collect()
1074}
1075
1076fn extract_first_ruby_value(s: &str) -> Option<String> {
1077    extract_all_ruby_values(s).into_iter().next()
1078}
1079
/// Returns the trimmed remainder of an argument list after its first
/// top-level comma, or `""` when there is no such comma.
///
/// Commas are ignored inside quoted strings (backslash escapes are honored),
/// inside `[]`, `{}`, `<>` groups, and inside parentheses.
fn after_first_argument(args: &str) -> &str {
    let mut group_depth = 0usize;
    let mut call_depth = 0usize;
    let mut open_quote: Option<char> = None;
    // Set after a backslash inside a quoted string: the next char is escaped.
    let mut skip_next = false;

    for (offset, current) in args.char_indices() {
        if skip_next {
            skip_next = false;
            continue;
        }

        if let Some(delimiter) = open_quote {
            if current == '\\' {
                skip_next = true;
            } else if current == delimiter {
                open_quote = None;
            }
            continue;
        }

        match current {
            '\'' | '"' => open_quote = Some(current),
            '[' | '{' | '<' => group_depth += 1,
            ']' | '}' | '>' => group_depth = group_depth.saturating_sub(1),
            '(' => call_depth += 1,
            ')' => call_depth = call_depth.saturating_sub(1),
            ',' if group_depth == 0 && call_depth == 0 => {
                return args[offset + 1..].trim();
            }
            _ => {}
        }
    }

    ""
}
1117
1118/// Bug #2: Resolves variable version references like `CSV::VERSION` or `RAILS_VERSION`.
1119///
1120/// Scans the file content for constant definitions matching the variable name
1121/// and returns the resolved string value.
1122fn resolve_variable_version(var_name: &str, contexts: &[String]) -> Option<String> {
1123    let var_name = var_name.trim();
1124    if var_name.is_empty() {
1125        return None;
1126    }
1127
1128    for candidate in candidate_constant_names(var_name) {
1129        let escaped = regex::escape(&candidate);
1130        let pattern = format!(r#"(?m)^\s*{}\s*=\s*["']([^"']+)["']"#, escaped);
1131        let Ok(re) = Regex::new(&pattern) else {
1132            continue;
1133        };
1134
1135        for context in contexts {
1136            if let Some(caps) = re.captures(context) {
1137                return caps.get(1).map(|m| m.as_str().to_string());
1138            }
1139        }
1140    }
1141
1142    None
1143}
1144
1145fn resolve_variable_array(var_name: &str, contexts: &[String]) -> Option<Vec<String>> {
1146    let var_name = var_name.trim();
1147    if var_name.is_empty() {
1148        return None;
1149    }
1150
1151    for candidate in candidate_constant_names(var_name) {
1152        let escaped = regex::escape(&candidate);
1153        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(\[[^\n]+\])"#, escaped);
1154        let Ok(re) = Regex::new(&pattern) else {
1155            continue;
1156        };
1157
1158        for context in contexts {
1159            if let Some(caps) = re.captures(context)
1160                && let Some(raw) = caps.get(1)
1161            {
1162                let values = extract_ruby_array(raw.as_str());
1163                if !values.is_empty() {
1164                    return Some(values);
1165                }
1166            }
1167        }
1168    }
1169
1170    None
1171}
1172
/// Lists the names to search for when resolving a constant reference: the
/// reference itself, followed by its last `::`-separated segment when that
/// differs from the whole reference.
fn candidate_constant_names(var_name: &str) -> Vec<String> {
    let unqualified = var_name
        .split("::")
        .last()
        .filter(|segment| *segment != var_name);

    std::iter::once(var_name)
        .chain(unqualified)
        .map(str::to_string)
        .collect()
}
1182
1183fn load_required_ruby_contexts(content: &str, base_dir: Option<&Path>) -> Vec<String> {
1184    let mut contexts = vec![content.to_string()];
1185    let Some(base_dir) = base_dir else {
1186        return contexts;
1187    };
1188
1189    let require_re = match Regex::new(r#"(?m)^\s*require(?:_relative)?\s+["']([^"']+)["']"#) {
1190        Ok(re) => re,
1191        Err(_) => return contexts,
1192    };
1193
1194    for caps in require_re.captures_iter(content) {
1195        let Some(required) = caps.get(1).map(|m| m.as_str()) else {
1196            continue;
1197        };
1198        for candidate in candidate_require_paths(base_dir, required) {
1199            if let Ok(required_content) = fs::read_to_string(&candidate) {
1200                contexts.push(required_content);
1201                break;
1202            }
1203        }
1204    }
1205
1206    contexts
1207}
1208
/// Maps a `require` target to the file paths to try, in order:
/// `<base_dir>/<target>.rb`, then `<base_dir>/lib/<target>.rb`.
///
/// `::` in the target is replaced with `/`, and `.rb` is appended unless the
/// target already ends with it.
fn candidate_require_paths(base_dir: &Path, required: &str) -> Vec<PathBuf> {
    let mut filename = required.replace("::", "/");
    if !filename.ends_with(".rb") {
        filename.push_str(".rb");
    }

    let lib_dir = base_dir.join("lib");
    vec![base_dir.join(&filename), lib_dir.join(&filename)]
}
1222
/// Heuristic: a value looks like a Ruby constant reference when it starts
/// with an ASCII uppercase letter or contains `::`.
fn looks_like_constant_reference(s: &str) -> bool {
    let starts_uppercase = matches!(s.chars().next(), Some(first) if first.is_ascii_uppercase());
    starts_uppercase || s.contains("::")
}
1226
/// Test-only helper: parses `.gemspec` text on its own, with no base
/// directory passed to `parse_gemspec_with_context`.
#[cfg(test)]
fn parse_gemspec(content: &str) -> PackageData {
    let base_dir: Option<&Path> = None;
    parse_gemspec_with_context(content, base_dir)
}
1232
1233fn parse_gemspec_with_context(content: &str, base_dir: Option<&Path>) -> PackageData {
1234    let contexts = load_required_ruby_contexts(content, base_dir);
1235
1236    // Regex for spec.name = "value" or s.name = "value"
1237    // The spec variable name varies: spec, s, gem, etc.
1238    let field_re = match Regex::new(
1239        r#"(?m)^\s*\w+\.(name|version|summary|description|homepage|license)\s*=\s*(.+)$"#,
1240    ) {
1241        Ok(r) => r,
1242        Err(e) => {
1243            warn!("Failed to compile gemspec field regex: {}", e);
1244            return default_package_data_with_datasource(DatasourceId::Gemspec);
1245        }
1246    };
1247
1248    let licenses_re = match Regex::new(r#"(?m)^\s*\w+\.licenses\s*=\s*(.+)$"#) {
1249        Ok(r) => r,
1250        Err(e) => {
1251            warn!("Failed to compile licenses regex: {}", e);
1252            return default_package_data_with_datasource(DatasourceId::Gemspec);
1253        }
1254    };
1255
1256    let authors_re = match Regex::new(r#"(?m)^\s*\w+\.(?:authors|author)\s*=\s*(.+)$"#) {
1257        Ok(r) => r,
1258        Err(e) => {
1259            warn!("Failed to compile authors regex: {}", e);
1260            return default_package_data_with_datasource(DatasourceId::Gemspec);
1261        }
1262    };
1263
1264    let email_re = match Regex::new(r#"(?m)^\s*\w+\.email\s*=\s*(.+)$"#) {
1265        Ok(r) => r,
1266        Err(e) => {
1267            warn!("Failed to compile email regex: {}", e);
1268            return default_package_data_with_datasource(DatasourceId::Gemspec);
1269        }
1270    };
1271
1272    let dependency_call_re = match Regex::new(
1273        r#"(?m)^\s*\w+\.(add_(?:development_|runtime_)?dependency)\s*\(?(.+?)\)?\s*$"#,
1274    ) {
1275        Ok(r) => r,
1276        Err(e) => {
1277            warn!("Failed to compile gemspec dependency regex: {}", e);
1278            return default_package_data_with_datasource(DatasourceId::Gemspec);
1279        }
1280    };
1281
1282    let mut name: Option<String> = None;
1283    let mut version: Option<String> = None;
1284    let mut summary: Option<String> = None;
1285    let mut description: Option<String> = None;
1286    let mut homepage: Option<String> = None;
1287    let mut license: Option<String> = None;
1288    let mut licenses: Vec<String> = Vec::new();
1289    let mut authors: Vec<String> = Vec::new();
1290    let mut emails: Vec<String> = Vec::new();
1291    let mut dependencies: Vec<Dependency> = Vec::new();
1292
1293    // Extract basic fields
1294    for caps in field_re.captures_iter(content) {
1295        let field_name = match caps.get(1) {
1296            Some(m) => m.as_str(),
1297            None => continue,
1298        };
1299        let raw_value = match caps.get(2) {
1300            Some(m) => m.as_str().trim(),
1301            None => continue,
1302        };
1303
1304        match field_name {
1305            "name" => {
1306                let cleaned = clean_gemspec_value(raw_value);
1307                name = if looks_like_constant_reference(&cleaned) {
1308                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1309                } else {
1310                    Some(cleaned)
1311                }
1312            }
1313            "version" => {
1314                let cleaned = clean_gemspec_value(raw_value);
1315                // Bug #2: Check if version is a variable reference
1316                if looks_like_constant_reference(&cleaned) {
1317                    version = resolve_variable_version(&cleaned, &contexts).or(Some(cleaned));
1318                } else {
1319                    version = Some(cleaned);
1320                }
1321            }
1322            "summary" => {
1323                let cleaned = clean_gemspec_value(raw_value);
1324                summary = if looks_like_constant_reference(&cleaned) {
1325                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1326                } else {
1327                    Some(cleaned)
1328                }
1329            }
1330            "description" => description = Some(clean_gemspec_value(raw_value)),
1331            "homepage" => {
1332                let cleaned = clean_gemspec_value(raw_value);
1333                homepage = if looks_like_constant_reference(&cleaned) {
1334                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1335                } else {
1336                    Some(cleaned)
1337                }
1338            }
1339            "license" => license = Some(clean_gemspec_value(raw_value)),
1340            _ => {}
1341        }
1342    }
1343
1344    // Extract licenses (plural)
1345    for caps in licenses_re.captures_iter(content) {
1346        if let Some(raw) = caps.get(1) {
1347            licenses = extract_ruby_array(raw.as_str());
1348        }
1349    }
1350
1351    // Extract authors
1352    for caps in authors_re.captures_iter(content) {
1353        if let Some(raw) = caps.get(1) {
1354            let raw_str = raw.as_str().trim();
1355            if raw_str.starts_with('[') {
1356                authors = extract_ruby_array(raw_str);
1357            } else if looks_like_constant_reference(raw_str) {
1358                authors = resolve_variable_array(raw_str, &contexts)
1359                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1360            } else {
1361                authors.push(clean_gemspec_value(raw_str));
1362            }
1363        }
1364    }
1365
1366    // Extract emails
1367    for caps in email_re.captures_iter(content) {
1368        if let Some(raw) = caps.get(1) {
1369            let raw_str = raw.as_str().trim();
1370            if raw_str.starts_with('[') {
1371                emails = extract_ruby_array(raw_str);
1372            } else if looks_like_constant_reference(raw_str) {
1373                emails = resolve_variable_array(raw_str, &contexts)
1374                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1375            } else {
1376                emails.push(clean_gemspec_value(raw_str));
1377            }
1378        }
1379    }
1380
1381    // Build parties from authors and emails
1382    let mut parties: Vec<Party> = Vec::new();
1383
1384    if authors.len() == 1 && emails.len() == 1 {
1385        let email_str = emails.first().map(String::as_str);
1386        let (parsed_email_name, parsed_email) = match email_str {
1387            Some(e) => split_name_email(e),
1388            None => (None, None),
1389        };
1390
1391        parties.push(Party {
1392            r#type: Some("person".to_string()),
1393            role: Some("author".to_string()),
1394            name: authors.first().cloned().or(parsed_email_name),
1395            email: parsed_email.or_else(|| {
1396                email_str
1397                    .filter(|e| e.contains('@') && !e.contains('<'))
1398                    .map(|e| e.to_string())
1399            }),
1400            url: None,
1401            organization: None,
1402            organization_url: None,
1403            timezone: None,
1404        });
1405    } else {
1406        for author_name in authors {
1407            parties.push(Party {
1408                r#type: Some("person".to_string()),
1409                role: Some("author".to_string()),
1410                name: Some(author_name),
1411                email: None,
1412                url: None,
1413                organization: None,
1414                organization_url: None,
1415                timezone: None,
1416            });
1417        }
1418
1419        for email_str in emails {
1420            let (parsed_email_name, parsed_email) = if email_str.contains('<') {
1421                split_name_email(&email_str)
1422            } else {
1423                (None, None)
1424            };
1425            parties.push(Party {
1426                r#type: Some("person".to_string()),
1427                role: Some("author".to_string()),
1428                name: parsed_email_name,
1429                email: parsed_email.or_else(|| email_str.contains('@').then_some(email_str)),
1430                url: None,
1431                organization: None,
1432                organization_url: None,
1433                timezone: None,
1434            });
1435        }
1436    }
1437
1438    for caps in dependency_call_re.captures_iter(content) {
1439        let method = match caps.get(1) {
1440            Some(m) => m.as_str(),
1441            None => continue,
1442        };
1443        let args = match caps.get(2) {
1444            Some(m) => m.as_str(),
1445            None => continue,
1446        };
1447
1448        let Some(dep_name) = extract_first_ruby_value(args) else {
1449            continue;
1450        };
1451        let version_parts = extract_all_ruby_values(after_first_argument(args));
1452        let extracted_requirement = if version_parts.is_empty() {
1453            None
1454        } else {
1455            Some(version_parts.join(", "))
1456        };
1457        let purl = create_gem_purl(&dep_name, None);
1458        let is_development = method == "add_development_dependency";
1459        let scope = if is_development {
1460            "development"
1461        } else {
1462            "runtime"
1463        };
1464
1465        dependencies.push(Dependency {
1466            purl,
1467            extracted_requirement,
1468            scope: Some(scope.to_string()),
1469            is_runtime: Some(!is_development),
1470            is_optional: Some(is_development),
1471            is_pinned: None,
1472            is_direct: Some(true),
1473            resolved_package: None,
1474            extra_data: None,
1475        });
1476    }
1477
1478    // Extract license statement only - detection happens in separate engine
1479    let extracted_license_statement = if !licenses.is_empty() {
1480        Some(licenses.join(" AND "))
1481    } else {
1482        license
1483    };
1484
1485    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1486        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1487
1488    // Prefer description over summary
1489    let final_description = description.or(summary);
1490
1491    // Build PURL
1492    let purl = name
1493        .as_deref()
1494        .map(|n| create_gem_purl(n, version.as_deref()))
1495        .unwrap_or(None);
1496
1497    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1498        if let Some(n) = name.as_deref() {
1499            get_rubygems_urls(n, version.as_deref(), None)
1500        } else {
1501            (None, None, None, None)
1502        };
1503
1504    PackageData {
1505        package_type: Some(PACKAGE_TYPE),
1506        name,
1507        version,
1508        primary_language: Some("Ruby".to_string()),
1509        description: final_description,
1510        homepage_url: homepage,
1511        download_url,
1512        declared_license_expression,
1513        declared_license_expression_spdx,
1514        license_detections,
1515        extracted_license_statement,
1516        parties,
1517        dependencies,
1518        repository_homepage_url,
1519        repository_download_url,
1520        api_data_url,
1521        datasource_id: Some(DatasourceId::Gemspec),
1522        purl,
1523        ..default_package_data()
1524    }
1525}
1526
1527// =============================================================================
1528// .gem Archive Parser (Wave 3)
1529// =============================================================================
1530
/// Largest on-disk size, in bytes, of a `.gem` file that `extract_gem_archive`
/// accepts; bigger archives are rejected before being opened.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
/// Upper bound, in bytes, applied both to the stored size of the
/// `metadata.gz` entry and to its decompressed content.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
/// Largest accepted ratio of decompressed size to stored size for
/// `metadata.gz`; anything above it is reported as suspicious.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
1534
1535/// Parser for .gem archive files.
1536///
1537/// Extracts metadata from Ruby .gem packages, which are tar archives
1538/// containing a gzip-compressed YAML metadata file (`metadata.gz`).
1539///
1540/// Includes safety checks against zip bombs and oversized archives.
1541pub struct GemArchiveParser;
1542
1543impl PackageParser for GemArchiveParser {
1544    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1545
1546    fn extract_packages(path: &Path) -> Vec<PackageData> {
1547        vec![match extract_gem_archive(path) {
1548            Ok(data) => data,
1549            Err(e) => {
1550                warn!("Failed to extract .gem archive at {:?}: {}", path, e);
1551                default_package_data_with_datasource(DatasourceId::GemArchive)
1552            }
1553        }]
1554    }
1555
1556    fn is_match(path: &Path) -> bool {
1557        path.extension()
1558            .and_then(|ext| ext.to_str())
1559            .is_some_and(|ext| ext == "gem")
1560    }
1561}
1562
1563fn extract_gem_archive(path: &Path) -> Result<PackageData, String> {
1564    let file_metadata =
1565        fs::metadata(path).map_err(|e| format!("Failed to read file metadata: {}", e))?;
1566    let archive_size = file_metadata.len();
1567
1568    if archive_size > MAX_ARCHIVE_SIZE {
1569        return Err(format!(
1570            "Archive too large: {} bytes (limit: {} bytes)",
1571            archive_size, MAX_ARCHIVE_SIZE
1572        ));
1573    }
1574
1575    let file = File::open(path).map_err(|e| format!("Failed to open archive: {}", e))?;
1576    let mut archive = Archive::new(file);
1577
1578    for entry_result in archive
1579        .entries()
1580        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1581    {
1582        let entry = entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1583        let entry_path = entry
1584            .path()
1585            .map_err(|e| format!("Failed to get entry path: {}", e))?;
1586
1587        if entry_path.to_str() == Some("metadata.gz") {
1588            let entry_size = entry.size();
1589            if entry_size > MAX_FILE_SIZE {
1590                return Err(format!(
1591                    "metadata.gz too large: {} bytes (limit: {} bytes)",
1592                    entry_size, MAX_FILE_SIZE
1593                ));
1594            }
1595
1596            let mut decoder = GzDecoder::new(entry);
1597            let mut content = String::new();
1598            decoder
1599                .read_to_string(&mut content)
1600                .map_err(|e| format!("Failed to decompress metadata.gz: {}", e))?;
1601
1602            let uncompressed_size = content.len() as u64;
1603            if entry_size > 0 {
1604                let ratio = uncompressed_size as f64 / entry_size as f64;
1605                if ratio > MAX_COMPRESSION_RATIO {
1606                    return Err(format!(
1607                        "Suspicious compression ratio: {:.2}:1 (limit: {:.0}:1)",
1608                        ratio, MAX_COMPRESSION_RATIO
1609                    ));
1610                }
1611            }
1612            if uncompressed_size > MAX_FILE_SIZE {
1613                return Err(format!(
1614                    "Decompressed metadata too large: {} bytes (limit: {} bytes)",
1615                    uncompressed_size, MAX_FILE_SIZE
1616                ));
1617            }
1618
1619            return parse_gem_metadata_yaml(&content, DatasourceId::GemArchive);
1620        }
1621    }
1622
1623    Err("metadata.gz not found in .gem archive".to_string())
1624}
1625
1626fn parse_gem_metadata_yaml(
1627    content: &str,
1628    datasource_id: DatasourceId,
1629) -> Result<PackageData, String> {
1630    // Ruby YAML tagged types need to be handled:
1631    // --- !ruby/object:Gem::Specification
1632    // We strip Ruby-specific YAML tags since yaml_serde can't handle them
1633    let cleaned = clean_ruby_yaml_tags(content);
1634
1635    let yaml: yaml_serde::Value =
1636        yaml_serde::from_str(&cleaned).map_err(|e| format!("Failed to parse YAML: {}", e))?;
1637
1638    let name = yaml_string(&yaml, "name");
1639    let version = yaml.get("version").and_then(|v| {
1640        // version can be a simple string or a mapping with a "version" key
1641        if v.is_string() {
1642            v.as_str().map(|s| s.to_string())
1643        } else {
1644            yaml_string(v, "version")
1645        }
1646    });
1647    let description = yaml_string(&yaml, "description").or_else(|| yaml_string(&yaml, "summary"));
1648    let homepage = yaml_string(&yaml, "homepage");
1649    let summary = yaml_string(&yaml, "summary");
1650
1651    // Licenses
1652    let licenses: Vec<String> = yaml
1653        .get("licenses")
1654        .and_then(|v| v.as_sequence())
1655        .map(|seq| {
1656            seq.iter()
1657                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1658                .collect()
1659        })
1660        .unwrap_or_default();
1661
1662    // Extract license statement only - detection happens in separate engine
1663    let extracted_license_statement = if !licenses.is_empty() {
1664        Some(licenses.join(" AND "))
1665    } else {
1666        None
1667    };
1668
1669    let (license_expression, license_expression_spdx, license_detections) =
1670        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1671
1672    // Authors
1673    let authors: Vec<String> = yaml
1674        .get("authors")
1675        .and_then(|v| v.as_sequence())
1676        .map(|seq| {
1677            seq.iter()
1678                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1679                .collect()
1680        })
1681        .unwrap_or_default();
1682
1683    let emails: Vec<String> = yaml
1684        .get("email")
1685        .map(|v| {
1686            if let Some(seq) = v.as_sequence() {
1687                seq.iter()
1688                    .filter_map(|item| item.as_str().map(|s| s.to_string()))
1689                    .collect()
1690            } else if let Some(s) = v.as_str() {
1691                vec![s.to_string()]
1692            } else {
1693                Vec::new()
1694            }
1695        })
1696        .unwrap_or_default();
1697
1698    // Build parties
1699    let mut parties: Vec<Party> = Vec::new();
1700    let max_len = authors.len().max(emails.len());
1701    for i in 0..max_len {
1702        let author_name = authors.get(i).map(|s| s.as_str());
1703        let email_str = emails.get(i).map(|s| s.as_str());
1704
1705        let (parsed_email_name, parsed_email) = match email_str {
1706            Some(e) if e.contains('<') => split_name_email(e),
1707            None => (None, None),
1708            _ => (None, None),
1709        };
1710
1711        let party_name = author_name.map(|s| s.to_string()).or(parsed_email_name);
1712
1713        parties.push(Party {
1714            r#type: Some("person".to_string()),
1715            role: Some("author".to_string()),
1716            name: party_name,
1717            email: parsed_email.or_else(|| {
1718                email_str
1719                    .filter(|e| e.contains('@') && !e.contains('<'))
1720                    .map(|e| e.to_string())
1721            }),
1722            url: None,
1723            organization: None,
1724            organization_url: None,
1725            timezone: None,
1726        });
1727    }
1728
1729    // Dependencies
1730    let dependencies = parse_gem_yaml_dependencies(&yaml);
1731
1732    let metadata = yaml.get("metadata");
1733
1734    let bug_tracking_url = metadata.and_then(|m| yaml_string(m, "bug_tracking_uri"));
1735
1736    let code_view_url = metadata.and_then(|m| yaml_string(m, "source_code_uri"));
1737
1738    let vcs_url = code_view_url
1739        .clone()
1740        .or_else(|| metadata.and_then(|m| yaml_string(m, "homepage_uri")));
1741
1742    let file_references = metadata
1743        .and_then(|m| m.get("files"))
1744        .and_then(|f| f.as_sequence())
1745        .map(|seq| {
1746            seq.iter()
1747                .filter_map(|v| v.as_str())
1748                .map(|s| crate::models::FileReference {
1749                    path: s.to_string(),
1750                    size: None,
1751                    sha1: None,
1752                    md5: None,
1753                    sha256: None,
1754                    sha512: None,
1755                    extra_data: None,
1756                })
1757                .collect::<Vec<_>>()
1758        })
1759        .unwrap_or_default();
1760
1761    let release_date = yaml_string(&yaml, "date").and_then(|d| {
1762        if d.len() >= 10 {
1763            Some(d[..10].to_string())
1764        } else {
1765            None
1766        }
1767    });
1768
1769    let purl = name
1770        .as_deref()
1771        .map(|n| create_gem_purl(n, version.as_deref()))
1772        .unwrap_or(None);
1773
1774    let platform = yaml_string(&yaml, "platform");
1775    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1776        if let Some(n) = name.as_deref() {
1777            get_rubygems_urls(n, version.as_deref(), platform.as_deref())
1778        } else {
1779            (None, None, None, None)
1780        };
1781
1782    let qualifiers = if let Some(ref p) = platform {
1783        if p != "ruby" {
1784            let mut q = HashMap::new();
1785            q.insert("platform".to_string(), p.clone());
1786            Some(q)
1787        } else {
1788            None
1789        }
1790    } else {
1791        None
1792    };
1793
1794    Ok(PackageData {
1795        package_type: Some(PACKAGE_TYPE),
1796        name,
1797        version,
1798        qualifiers,
1799        primary_language: Some("Ruby".to_string()),
1800        description: description.or(summary),
1801        release_date,
1802        homepage_url: homepage,
1803        download_url,
1804        bug_tracking_url,
1805        code_view_url,
1806        declared_license_expression: license_expression,
1807        declared_license_expression_spdx: license_expression_spdx,
1808        license_detections,
1809        extracted_license_statement,
1810        file_references,
1811        parties,
1812        dependencies,
1813        repository_homepage_url,
1814        repository_download_url,
1815        api_data_url,
1816        datasource_id: Some(datasource_id),
1817        purl,
1818        vcs_url,
1819        ..default_package_data()
1820    })
1821}
1822
1823/// Strips Ruby-specific YAML tags that yaml_serde cannot handle.
1824fn clean_ruby_yaml_tags(content: &str) -> String {
1825    let tag_re = match Regex::new(r"!ruby/\S+") {
1826        Ok(r) => r,
1827        Err(_) => return content.to_string(),
1828    };
1829    tag_re.replace_all(content, "").to_string()
1830}
1831
1832fn yaml_string(yaml: &yaml_serde::Value, key: &str) -> Option<String> {
1833    yaml.get(key)
1834        .and_then(|v| v.as_str())
1835        .filter(|s| !s.is_empty())
1836        .map(|s| s.to_string())
1837}
1838
1839fn parse_gem_yaml_dependencies(yaml: &yaml_serde::Value) -> Vec<Dependency> {
1840    let mut dependencies = Vec::new();
1841
1842    let deps_seq = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
1843        Some(seq) => seq,
1844        None => return dependencies,
1845    };
1846
1847    for dep_value in deps_seq {
1848        let dep_name = match yaml_string(dep_value, "name") {
1849            Some(n) => n,
1850            None => continue,
1851        };
1852
1853        let dep_type = yaml_string(dep_value, "type");
1854        let is_development = dep_type.as_deref() == Some(":development");
1855
1856        // Extract version requirements from the nested structure
1857        let requirements = dep_value
1858            .get("requirement")
1859            .or_else(|| dep_value.get("version_requirements"))
1860            .and_then(|req| req.get("requirements"))
1861            .and_then(|reqs| reqs.as_sequence());
1862
1863        let extracted_requirement = requirements.map(|reqs| {
1864            let parts: Vec<String> = reqs
1865                .iter()
1866                .filter_map(|req| {
1867                    let seq = req.as_sequence()?;
1868                    if seq.len() >= 2 {
1869                        let op = seq[0].as_str().unwrap_or("");
1870                        let ver = seq[1].get("version").and_then(|v| v.as_str()).unwrap_or("");
1871                        if op == ">=" && ver == "0" {
1872                            // ">= 0" means "any version" - skip
1873                            None
1874                        } else if op.is_empty() || ver.is_empty() {
1875                            None
1876                        } else {
1877                            Some(format!("{} {}", op, ver))
1878                        }
1879                    } else {
1880                        None
1881                    }
1882                })
1883                .collect();
1884            parts.join(", ")
1885        });
1886
1887        let extracted_requirement = extracted_requirement
1888            .filter(|s| !s.is_empty())
1889            .or_else(|| Some(String::new()));
1890
1891        let (scope, is_runtime, is_optional) = if is_development {
1892            (Some("development".to_string()), false, true)
1893        } else {
1894            (Some("runtime".to_string()), true, false)
1895        };
1896
1897        let purl = create_gem_purl(&dep_name, None);
1898
1899        dependencies.push(Dependency {
1900            purl,
1901            extracted_requirement,
1902            scope,
1903            is_runtime: Some(is_runtime),
1904            is_optional: Some(is_optional),
1905            is_pinned: None,
1906            is_direct: Some(true),
1907            resolved_package: None,
1908            extra_data: None,
1909        });
1910    }
1911
1912    dependencies
1913}
1914
1915// =============================================================================
1916// Gem Metadata Extracted Parser (metadata.gz-extract files)
1917// =============================================================================
1918
1919pub struct GemMetadataExtractedParser;
1920
1921impl PackageParser for GemMetadataExtractedParser {
1922    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1923
1924    fn extract_packages(path: &Path) -> Vec<PackageData> {
1925        vec![match extract_gem_metadata_extracted(path) {
1926            Ok(data) => data,
1927            Err(e) => {
1928                warn!("Failed to extract gem metadata from {:?}: {}", path, e);
1929                default_package_data_with_datasource(DatasourceId::GemArchiveExtracted)
1930            }
1931        }]
1932    }
1933
1934    fn is_match(path: &Path) -> bool {
1935        path.to_str()
1936            .is_some_and(|p| p.contains("metadata.gz-extract"))
1937    }
1938}
1939
1940fn extract_gem_metadata_extracted(path: &Path) -> Result<PackageData, String> {
1941    let content = fs::read_to_string(path)
1942        .map_err(|e| format!("Failed to read metadata.gz-extract file: {}", e))?;
1943
1944    parse_gem_metadata_yaml(&content, DatasourceId::GemArchiveExtracted)
1945}
1946
// Register parser metadata: one `register_parser!` entry per Ruby datafile
// kind handled in this module. Each entry passes a description, the path glob
// patterns, the string "gem", the string "Ruby", and a documentation URL.
// NOTE(review): the meaning of each positional argument is inferred from the
// values passed here — confirm against the `register_parser!` macro definition.

// Gemfile manifests, including a Gemfile located under a `data.gz-extract` directory.
crate::register_parser!(
    "Ruby Gemfile manifest",
    &["**/Gemfile", "**/data.gz-extract/Gemfile"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// Gemfile.lock lockfiles, including one located under a `data.gz-extract` directory.
crate::register_parser!(
    "Ruby Gemfile.lock lockfile",
    &["**/Gemfile.lock", "**/data.gz-extract/Gemfile.lock"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// .gemspec files anywhere, plus explicit patterns for `data.gz-extract` and
// `specifications` directories.
crate::register_parser!(
    "Ruby .gemspec manifest",
    &[
        "**/*.gemspec",
        "**/data.gz-extract/*.gemspec",
        "**/specifications/*.gemspec"
    ],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Packaged .gem archives.
crate::register_parser!(
    "Ruby .gem archive",
    &["**/*.gem"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Gem metadata files named `metadata.gz-extract`.
crate::register_parser!(
    "Ruby gem metadata (extracted)",
    &["**/metadata.gz-extract"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);
1991
#[cfg(test)]
mod tests {
    use super::{clean_gemspec_value, parse_gemspec};

    /// A `%q{` literal that is never closed still has its opener stripped.
    #[test]
    fn test_clean_gemspec_value_handles_unterminated_percent_q() {
        let cleaned = clean_gemspec_value("%q{Arel is a SQL AST manager for Ruby. It");
        assert_eq!(cleaned, "Arel is a SQL AST manager for Ruby. It");
    }

    /// Both `add_runtime_dependency` and `add_dependency` produce runtime-scoped
    /// dependencies that keep their version requirement.
    #[test]
    fn test_parse_gemspec_runtime_dependency_scope() {
        let gemspec_source = r#"
Gem::Specification.new do |spec|
  spec.name = "demo"
  spec.version = "1.0.0"
  spec.add_runtime_dependency "rack", "~> 3.0"
  spec.add_dependency "thor", ">= 1.0"
end
"#;

        let parsed = parse_gemspec(gemspec_source);

        // Expected requirement strings, in declaration order.
        let expected_requirements = ["~> 3.0", ">= 1.0"];
        assert_eq!(parsed.dependencies.len(), 2);

        for (dependency, requirement) in parsed.dependencies.iter().zip(expected_requirements) {
            assert_eq!(dependency.scope, Some("runtime".to_string()));
            assert_eq!(
                dependency.extracted_requirement,
                Some(requirement.to_string())
            );
        }
    }
}