Skip to main content

provenant/parsers/
ruby.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for Ruby/RubyGems package manifests.
5//!
6//! Extracts package metadata, dependencies, and platform information from
7//! Gemfile and Gemfile.lock files used by Ruby/Bundler projects.
8//!
9//! # Supported Formats
10//! - Gemfile (manifest with Ruby DSL)
11//! - Gemfile.lock (lockfile with state machine sections)
12//! - *.gemspec (gem specification files)
13//! - *.gem (gem archive packages)
14//! - metadata.gz-extract (pre-extracted gem metadata)
15//!
16//! # Key Features
17//! - State machine parsing for Gemfile.lock sections (GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH, DEPENDENCIES)
18//! - Regex-based Ruby DSL parsing for Gemfile
19//! - Dependency group handling (:development, :test, etc.)
20//! - Platform-specific gem support
21//! - Pessimistic version operator (~>) support
22//! - Bug Fix #1: Strip .freeze suffix from strings
23//! - Bug Fix #4: Correct dependency scope mapping (:runtime → None, :development → "development")
24//!
25//! # Implementation Notes
26//! - Uses regex for pattern matching (not full Ruby AST)
27//! - Graceful error handling: logs warnings and returns default on parse failure
28//! - PURL type: "gem"
29
30use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
31use crate::parser_warn as warn;
32use crate::parsers::utils::{
33    MAX_ITERATION_COUNT, read_file_to_string, split_name_email, truncate_field,
34};
35use flate2::read::GzDecoder;
36use packageurl::PackageUrl;
37use regex::Regex;
38use std::collections::HashMap;
39use std::fs::{self, File};
40use std::io::Read;
41use std::path::{Path, PathBuf};
42use tar::Archive;
43
44use super::PackageParser;
45use super::license_normalization::normalize_spdx_declared_license;
46use super::metadata::ParserMetadata;
47
48const PACKAGE_TYPE: PackageType = PackageType::Gem;
49
50// =============================================================================
51// Bug Fix #1: Strip .freeze suffix from strings
52// =============================================================================
53
/// Removes trailing `.freeze` calls from a Ruby string expression.
///
/// Ruby's `.freeze` only marks a string as immutable, so it carries nothing
/// that matters for gem names or versions. Every trailing occurrence is
/// removed: `"name".freeze` yields `"name"`, `'1.0.0'.freeze` yields
/// `'1.0.0'`, and input without the suffix is returned unchanged.
pub fn strip_freeze_suffix(s: &str) -> &str {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix(".freeze") {
        remaining = shorter;
    }
    remaining
}
64
/// A `do ... end` block that `parse_gemfile` keeps on its block stack while
/// scanning a Gemfile.
enum GemfileBlock {
    /// A `group ... do` block; holds the symbol names found on its opening line.
    Group(Vec<String>),
    /// A `source "..." do` block; holds the quoted source string.
    Source(String),
}
69
70// =============================================================================
71// Gemfile Parser (Ruby DSL)
72// =============================================================================
73
74/// Ruby Gemfile parser for manifest files.
75///
76/// Parses Ruby DSL syntax to extract gem declarations, dependency groups,
77/// platform-specific gems, and version constraints.
78pub struct GemfileParser;
79
80impl PackageParser for GemfileParser {
81    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
82
83    fn metadata() -> Vec<ParserMetadata> {
84        vec![ParserMetadata {
85            description: "Ruby Gemfile manifest",
86            file_patterns: &["**/Gemfile", "**/data.gz-extract/Gemfile"],
87            package_type: "gem",
88            primary_language: "Ruby",
89            documentation_url: Some("https://bundler.io/man/gemfile.5.html"),
90        }]
91    }
92
93    fn extract_packages(path: &Path) -> Vec<PackageData> {
94        let datasource_id = gemfile_datasource_id(path);
95        let content = match read_file_to_string(path, None) {
96            Ok(c) => c,
97            Err(e) => {
98                warn!("Failed to read Gemfile at {:?}: {}", path, e);
99                return vec![default_package_data_with_datasource(datasource_id)];
100            }
101        };
102
103        let mut package_data = parse_gemfile(&content);
104        package_data.datasource_id = Some(datasource_id);
105        vec![package_data]
106    }
107
108    fn is_match(path: &Path) -> bool {
109        path.file_name()
110            .and_then(|n| n.to_str())
111            .is_some_and(|name| name == "Gemfile")
112            || path
113                .to_str()
114                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile"))
115    }
116}
117
118/// Parses Gemfile content and extracts dependencies with groups.
119fn parse_gemfile(content: &str) -> PackageData {
120    let mut dependencies = Vec::new();
121    let mut block_stack = Vec::new();
122    let mut default_source = None;
123    let mut sources = Vec::new();
124
125    // Regex patterns for Gemfile parsing
126    // gem "name", "version", options...
127    let gem_regex = match Regex::new(
128        r#"^\s*gem\s+["']([^"']+)["'](?:\.freeze)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*(.+))?"#,
129    ) {
130        Ok(r) => r,
131        Err(e) => {
132            warn!("Failed to compile gem regex: {}", e);
133            return default_package_data_with_datasource(DatasourceId::Gemfile);
134        }
135    };
136
137    // group :name do ... end
138    let group_start_regex = match Regex::new(r"^\s*group\s+(.+?)\s+do\s*$") {
139        Ok(r) => r,
140        Err(e) => {
141            warn!("Failed to compile group regex: {}", e);
142            return default_package_data_with_datasource(DatasourceId::Gemfile);
143        }
144    };
145
146    let group_end_regex = match Regex::new(r"^\s*end\s*$") {
147        Ok(r) => r,
148        Err(e) => {
149            warn!("Failed to compile end regex: {}", e);
150            return default_package_data_with_datasource(DatasourceId::Gemfile);
151        }
152    };
153
154    let source_block_start_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s+do\s*$"#) {
155        Ok(r) => r,
156        Err(e) => {
157            warn!("Failed to compile source block regex: {}", e);
158            return default_package_data_with_datasource(DatasourceId::Gemfile);
159        }
160    };
161
162    let source_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s*$"#) {
163        Ok(r) => r,
164        Err(e) => {
165            warn!("Failed to compile source regex: {}", e);
166            return default_package_data_with_datasource(DatasourceId::Gemfile);
167        }
168    };
169
170    // Parse symbols like :development, :test
171    let symbol_regex = match Regex::new(r":(\w+)") {
172        Ok(r) => r,
173        Err(e) => {
174            warn!("Failed to compile symbol regex: {}", e);
175            return default_package_data_with_datasource(DatasourceId::Gemfile);
176        }
177    };
178
179    for line in content.lines().take(MAX_ITERATION_COUNT) {
180        let trimmed = line.trim();
181
182        // Skip comments and empty lines
183        if trimmed.is_empty() || trimmed.starts_with('#') {
184            continue;
185        }
186
187        // Check for group start
188        if let Some(caps) = group_start_regex.captures(trimmed) {
189            let groups_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
190            let mut current_groups = Vec::new();
191            for cap in symbol_regex.captures_iter(groups_str) {
192                if let Some(group_name) = cap.get(1) {
193                    current_groups.push(group_name.as_str().to_string());
194                }
195            }
196            block_stack.push(GemfileBlock::Group(current_groups));
197            continue;
198        }
199
200        if let Some(caps) = source_block_start_regex.captures(trimmed) {
201            let source = caps
202                .get(1)
203                .map(|m| m.as_str().to_string())
204                .unwrap_or_default();
205            if !source.is_empty() {
206                push_unique_string(&mut sources, source.clone());
207                block_stack.push(GemfileBlock::Source(source));
208            }
209            continue;
210        }
211
212        if let Some(caps) = source_regex.captures(trimmed) {
213            if let Some(source) = caps.get(1).map(|m| m.as_str().to_string()) {
214                push_unique_string(&mut sources, source.clone());
215                default_source = Some(source);
216            }
217            continue;
218        }
219
220        // Check for group end
221        if group_end_regex.is_match(trimmed) {
222            block_stack.pop();
223            continue;
224        }
225
226        // Parse gem declaration
227        if let Some(caps) = gem_regex.captures(trimmed) {
228            let name = strip_freeze_suffix(caps.get(1).map(|m| m.as_str()).unwrap_or(""));
229            if name.is_empty() {
230                continue;
231            }
232
233            // Collect version constraints
234            let mut version_parts = Vec::new();
235            if let Some(v) = caps.get(2) {
236                version_parts.push(strip_freeze_suffix(v.as_str()).to_string());
237            }
238            if let Some(v) = caps.get(3) {
239                let v_str = strip_freeze_suffix(v.as_str());
240                // Check if it looks like a version constraint
241                if looks_like_version_constraint(v_str) {
242                    version_parts.push(v_str.to_string());
243                }
244            }
245
246            let extracted_requirement = if version_parts.is_empty() {
247                None
248            } else {
249                Some(version_parts.join(", "))
250            };
251
252            let current_groups = current_group_names(&block_stack);
253
254            // Determine scope based on current group
255            // Bug Fix #4: :runtime → None, :development → "development"
256            let (scope, is_runtime, is_optional) = if current_groups.is_empty() {
257                // No group = runtime dependency
258                (None, true, false)
259            } else if current_groups.iter().any(|g| g == "development") {
260                (Some("development".to_string()), false, true)
261            } else if current_groups.iter().any(|g| g == "test") {
262                (Some("test".to_string()), false, true)
263            } else {
264                // Other groups (e.g., :production)
265                let group = current_groups.first().cloned();
266                (group, true, false)
267            };
268
269            // Create PURL
270            let purl = create_gem_purl(name, None);
271            let inherited_source = current_source(&block_stack, default_source.as_deref());
272            let extra_data = build_gemfile_dependency_extra_data(
273                caps.get(4).map(|m| m.as_str()),
274                inherited_source.as_deref(),
275            );
276
277            dependencies.push(Dependency {
278                purl,
279                extracted_requirement,
280                scope,
281                is_runtime: Some(is_runtime),
282                is_optional: Some(is_optional),
283                is_pinned: None,
284                is_direct: Some(true),
285                resolved_package: None,
286                extra_data,
287            });
288        }
289    }
290
291    let extra_data = if sources.is_empty() {
292        None
293    } else {
294        Some(HashMap::from([(
295            "sources".to_string(),
296            serde_json::Value::Array(sources.into_iter().map(serde_json::Value::String).collect()),
297        )]))
298    };
299
300    PackageData {
301        package_type: Some(PACKAGE_TYPE),
302        primary_language: Some("Ruby".to_string()),
303        dependencies,
304        extra_data,
305        datasource_id: Some(DatasourceId::Gemfile),
306        ..default_package_data()
307    }
308}
309
310fn current_group_names(block_stack: &[GemfileBlock]) -> Vec<String> {
311    block_stack
312        .iter()
313        .rev()
314        .find_map(|block| match block {
315            GemfileBlock::Group(groups) => Some(groups.clone()),
316            GemfileBlock::Source(_) => None,
317        })
318        .unwrap_or_default()
319}
320
321fn current_source(block_stack: &[GemfileBlock], default_source: Option<&str>) -> Option<String> {
322    block_stack
323        .iter()
324        .rev()
325        .find_map(|block| match block {
326            GemfileBlock::Source(source) => Some(source.clone()),
327            GemfileBlock::Group(_) => None,
328        })
329        .or_else(|| default_source.map(str::to_string))
330}
331
/// Appends `value` to `values` unless an equal string is already present,
/// so the list keeps first-seen order without duplicates.
fn push_unique_string(values: &mut Vec<String>, value: String) {
    let already_present = values.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    values.push(value);
}
337
338fn build_gemfile_dependency_extra_data(
339    options: Option<&str>,
340    inherited_source: Option<&str>,
341) -> Option<HashMap<String, serde_json::Value>> {
342    let mut extra = HashMap::new();
343    let options = options.unwrap_or("");
344
345    if let Some(git) = extract_gemfile_quoted_option(options, "git") {
346        extra.insert(
347            "source_type".to_string(),
348            serde_json::Value::String("GIT".to_string()),
349        );
350        extra.insert("git".to_string(), serde_json::Value::String(git.clone()));
351        extra.insert("remote".to_string(), serde_json::Value::String(git));
352    }
353
354    if let Some(path) = extract_gemfile_quoted_option(options, "path") {
355        extra.insert(
356            "source_type".to_string(),
357            serde_json::Value::String("PATH".to_string()),
358        );
359        extra.insert("path".to_string(), serde_json::Value::String(path));
360    }
361
362    for key in ["branch", "ref", "tag"] {
363        if let Some(value) = extract_gemfile_quoted_option(options, key) {
364            extra.insert(key.to_string(), serde_json::Value::String(value));
365        }
366    }
367
368    let direct_source = extract_gemfile_quoted_option(options, "source");
369    if let Some(source) = direct_source {
370        extra.insert("source".to_string(), serde_json::Value::String(source));
371    } else if !extra.contains_key("source_type")
372        && let Some(source) = inherited_source
373    {
374        extra.insert(
375            "source".to_string(),
376            serde_json::Value::String(source.to_string()),
377        );
378    }
379
380    (!extra.is_empty()).then_some(extra)
381}
382
383fn extract_gemfile_quoted_option(options: &str, key: &str) -> Option<String> {
384    if options.is_empty() {
385        return None;
386    }
387
388    let pattern = format!(r#"(?:^|,\s*){}\s*:\s*["']([^"']+)["']"#, regex::escape(key));
389    Regex::new(&pattern)
390        .ok()
391        .and_then(|regex| regex.captures(options))
392        .and_then(|captures| captures.get(1).map(|m| m.as_str().to_string()))
393}
394
/// Reports whether `s` begins like a version constraint: with one of the
/// operator characters `~`, `>`, `<`, `=`, `!`, or with an ASCII digit.
/// An empty string is not a constraint.
fn looks_like_version_constraint(s: &str) -> bool {
    match s.chars().next() {
        Some('~' | '>' | '<' | '=' | '!') => true,
        Some(first) => first.is_ascii_digit(),
        None => false,
    }
}
404
405// =============================================================================
406// Gemfile.lock Parser (State Machine)
407// =============================================================================
408
409/// Ruby Gemfile.lock parser for lockfiles.
410///
411/// Uses a state machine to parse sections: GEM, GIT, PATH, SVN,
412/// PLATFORMS, BUNDLED WITH, DEPENDENCIES.
413pub struct GemfileLockParser;
414
415impl PackageParser for GemfileLockParser {
416    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
417
418    fn metadata() -> Vec<ParserMetadata> {
419        vec![ParserMetadata {
420            description: "Ruby Gemfile.lock lockfile",
421            file_patterns: &["**/Gemfile.lock", "**/data.gz-extract/Gemfile.lock"],
422            package_type: "gem",
423            primary_language: "Ruby",
424            documentation_url: Some("https://bundler.io/man/gemfile.5.html"),
425        }]
426    }
427
428    fn extract_packages(path: &Path) -> Vec<PackageData> {
429        let datasource_id = gemfile_lock_datasource_id(path);
430        let content = match read_file_to_string(path, None) {
431            Ok(c) => c,
432            Err(e) => {
433                warn!("Failed to read Gemfile.lock at {:?}: {}", path, e);
434                return vec![default_package_data_with_datasource(datasource_id)];
435            }
436        };
437
438        let mut package_data = parse_gemfile_lock(&content);
439        package_data.datasource_id = Some(datasource_id);
440        vec![package_data]
441    }
442
443    fn is_match(path: &Path) -> bool {
444        path.file_name()
445            .and_then(|n| n.to_str())
446            .is_some_and(|name| name == "Gemfile.lock")
447            || path
448                .to_str()
449                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile.lock"))
450    }
451}
452
/// Parse state for Gemfile.lock state machine.
///
/// Records which lockfile section the line being processed belongs to.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// No recognised section is active.
    None,
    /// Inside a `GEM` section, before its `specs:` line.
    Gem,
    /// Inside a `GIT` section, before its `specs:` line.
    Git,
    /// Inside a `PATH` section, before its `specs:` line.
    Path,
    /// Inside an `SVN` section, before its `specs:` line.
    Svn,
    /// After the `specs:` line of a GEM/GIT/PATH/SVN section.
    Specs,
    /// Inside the `PLATFORMS` section.
    Platforms,
    /// Inside the `BUNDLED WITH` section.
    BundledWith,
    /// Inside the `DEPENDENCIES` section.
    Dependencies,
}
466
/// Parsed gem information from Gemfile.lock.
///
/// All fields are actively used:
/// - `gem_type`, `remote`, `revision`, `ref_field`, `branch`, `tag`: Stored in extra_data for GIT/PATH/SVN sources
/// - `name`, `version`, `platform`, `pinned`: Used for dependency PURL and metadata generation
/// - `requirements`: Stored as extracted_requirement for version constraints
#[derive(Debug, Clone, Default)]
struct GemInfo {
    // Gem name as written on the spec or dependency line.
    name: String,
    // Version part of the parenthesised spec entry (text before the first `-`).
    version: Option<String>,
    // Platform part of the parenthesised spec entry (text after the first `-`).
    platform: Option<String>,
    // Section the gem was listed under: "GEM", "GIT", "PATH" or "SVN".
    gem_type: String,
    // Value of the section's `remote:` option.
    remote: Option<String>,
    // Value of the section's `revision:` option.
    revision: Option<String>,
    // Value of the section's `ref:` option.
    ref_field: Option<String>,
    // Value of the section's `branch:` option.
    branch: Option<String>,
    // Value of the section's `tag:` option.
    tag: Option<String>,
    // True when the gem's DEPENDENCIES line ends with `!`.
    pinned: bool,
    // Version constraints collected from DEPENDENCIES lines.
    requirements: Vec<String>,
}
487
488fn select_primary_path_gem(gems: &HashMap<String, GemInfo>) -> Option<GemInfo> {
489    let mut path_gems: Vec<&GemInfo> = gems.values().filter(|gem| gem.gem_type == "PATH").collect();
490    path_gems.sort_by(|left, right| {
491        left.remote
492            .as_deref()
493            .cmp(&right.remote.as_deref())
494            .then_with(|| left.name.cmp(&right.name))
495    });
496
497    path_gems
498        .iter()
499        .copied()
500        .find(|gem| gem.pinned && gem.remote.as_deref() == Some("."))
501        .or_else(|| path_gems.iter().copied().find(|gem| gem.pinned))
502        .or_else(|| {
503            path_gems
504                .iter()
505                .copied()
506                .find(|gem| gem.remote.as_deref() == Some("."))
507        })
508        .or_else(|| path_gems.first().copied())
509        .cloned()
510}
511
512/// Parses Gemfile.lock content using a state machine.
513fn parse_gemfile_lock(content: &str) -> PackageData {
514    let mut state = ParseState::None;
515    let mut dependencies = Vec::new();
516    let mut gems: HashMap<String, GemInfo> = HashMap::new();
517    let mut platforms: Vec<String> = Vec::new();
518    let mut bundler_version: Option<String> = None;
519    let mut current_gem_type = String::new();
520    let mut current_remote: Option<String> = None;
521    let mut current_options: HashMap<String, String> = HashMap::new();
522
523    // DEPS pattern: 2 spaces at line start
524    let deps_regex = match Regex::new(r"^ {2}([^ \)\(,!:]+)(?: \(([^)]+)\))?(!)?$") {
525        Ok(r) => r,
526        Err(e) => {
527            warn!("Failed to compile deps regex: {}", e);
528            return default_package_data_with_datasource(DatasourceId::GemfileLock);
529        }
530    };
531
532    // SPEC_DEPS pattern: 4 spaces at line start
533    let spec_deps_regex = match Regex::new(r"^ {4}([^ \)\(,!:]+)(?: \(([^)]+)\))?$") {
534        Ok(r) => r,
535        Err(e) => {
536            warn!("Failed to compile spec_deps regex: {}", e);
537            return default_package_data_with_datasource(DatasourceId::GemfileLock);
538        }
539    };
540
541    // OPTIONS pattern: key: value
542    let options_regex = match Regex::new(r"^ {2}([a-z]+): (.+)$") {
543        Ok(r) => r,
544        Err(e) => {
545            warn!("Failed to compile options regex: {}", e);
546            return default_package_data_with_datasource(DatasourceId::GemfileLock);
547        }
548    };
549
550    // VERSION pattern for BUNDLED WITH
551    let version_regex = match Regex::new(r"^\s+(\d+(?:\.\d+)+)\s*$") {
552        Ok(r) => r,
553        Err(e) => {
554            warn!("Failed to compile version regex: {}", e);
555            return default_package_data_with_datasource(DatasourceId::GemfileLock);
556        }
557    };
558
559    for line in content.lines().take(MAX_ITERATION_COUNT) {
560        let trimmed = line.trim_end();
561
562        // Empty line resets state
563        if trimmed.is_empty() {
564            current_options.clear();
565            continue;
566        }
567
568        // Section headers (no leading whitespace) and sub-section headers
569        match trimmed {
570            "GEM" => {
571                state = ParseState::Gem;
572                current_gem_type = "GEM".to_string();
573                current_remote = None;
574                current_options.clear();
575                continue;
576            }
577            "GIT" => {
578                state = ParseState::Git;
579                current_gem_type = "GIT".to_string();
580                current_remote = None;
581                current_options.clear();
582                continue;
583            }
584            "PATH" => {
585                state = ParseState::Path;
586                current_gem_type = "PATH".to_string();
587                current_remote = None;
588                current_options.clear();
589                continue;
590            }
591            "SVN" => {
592                state = ParseState::Svn;
593                current_gem_type = "SVN".to_string();
594                current_remote = None;
595                current_options.clear();
596                continue;
597            }
598            "PLATFORMS" => {
599                state = ParseState::Platforms;
600                continue;
601            }
602            "BUNDLED WITH" => {
603                state = ParseState::BundledWith;
604                continue;
605            }
606            "DEPENDENCIES" => {
607                state = ParseState::Dependencies;
608                continue;
609            }
610            _ => {}
611        }
612
613        // Check for "  specs:" sub-section header (2-space indent) within
614        // GEM/GIT/PATH/SVN sections. This must be checked separately because
615        // the leading whitespace is preserved by trim_end().
616        if trimmed.trim() == "specs:" {
617            state = match state {
618                ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
619                    ParseState::Specs
620                }
621                _ => state,
622            };
623            continue;
624        }
625
626        // Process based on current state
627        match state {
628            ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
629                // Parse options (remote:, revision:, ref:, branch:, tag:)
630                if let Some(caps) = options_regex.captures(line) {
631                    let key = caps.get(1).map(|m| m.as_str()).unwrap_or("");
632                    let value = caps.get(2).map(|m| m.as_str()).unwrap_or("");
633                    current_options.insert(key.to_string(), value.to_string());
634                    if key == "remote" {
635                        current_remote = Some(value.to_string());
636                    }
637                }
638            }
639            ParseState::Specs => {
640                // Parse gem specs (4 spaces indent)
641                if let Some(caps) = spec_deps_regex.captures(line) {
642                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
643                    let version_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
644
645                    // Parse version and platform
646                    let (version, platform) = parse_version_platform(version_str);
647
648                    if !name.is_empty() {
649                        let gem_info = GemInfo {
650                            name: name.clone(),
651                            version,
652                            platform,
653                            gem_type: current_gem_type.clone(),
654                            remote: current_remote.clone(),
655                            revision: current_options.get("revision").cloned(),
656                            ref_field: current_options.get("ref").cloned(),
657                            branch: current_options.get("branch").cloned(),
658                            tag: current_options.get("tag").cloned(),
659                            pinned: false,
660                            requirements: Vec::new(),
661                        };
662                        gems.insert(name, gem_info);
663                    }
664                }
665            }
666            ParseState::Platforms => {
667                // Parse platform entries (2 spaces indent)
668                let platform = trimmed.trim();
669                if !platform.is_empty() {
670                    platforms.push(platform.to_string());
671                }
672            }
673            ParseState::BundledWith => {
674                // Parse bundler version
675                if let Some(caps) = version_regex.captures(line) {
676                    bundler_version = caps.get(1).map(|m| m.as_str().to_string());
677                }
678            }
679            ParseState::Dependencies => {
680                // Parse direct dependencies (2 spaces indent)
681                if let Some(caps) = deps_regex.captures(line) {
682                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
683                    let version_constraint = caps.get(2).map(|m| m.as_str().to_string());
684                    let pinned = caps.get(3).is_some();
685
686                    if !name.is_empty() {
687                        // Update gem info if exists, or create new
688                        if let Some(gem) = gems.get_mut(&name) {
689                            gem.pinned = pinned;
690                            if let Some(vc) = &version_constraint {
691                                gem.requirements.push(vc.clone());
692                            }
693                        } else {
694                            let gem_info = GemInfo {
695                                name: name.clone(),
696                                version: None,
697                                platform: None,
698                                gem_type: "GEM".to_string(),
699                                remote: None,
700                                revision: None,
701                                ref_field: None,
702                                branch: None,
703                                tag: None,
704                                pinned,
705                                requirements: version_constraint.into_iter().collect(),
706                            };
707                            gems.insert(name, gem_info);
708                        }
709                    }
710                }
711            }
712            ParseState::None => {}
713        }
714    }
715
716    let primary_gem = select_primary_path_gem(&gems);
717
718    let (
719        package_name,
720        package_version,
721        repository_homepage_url,
722        repository_download_url,
723        api_data_url,
724        download_url,
725    ) = if let Some(ref pg) = primary_gem {
726        let urls = get_rubygems_urls(&pg.name, pg.version.as_deref(), pg.platform.as_deref());
727        (
728            Some(pg.name.clone()),
729            pg.version.clone(),
730            urls.0,
731            urls.1,
732            urls.2,
733            urls.3,
734        )
735    } else {
736        (None, None, None, None, None, None)
737    };
738
739    for (_, gem) in gems {
740        if let Some(ref pg) = primary_gem
741            && gem.name == pg.name
742        {
743            continue;
744        }
745
746        let version_for_purl = gem.version.as_deref();
747        let purl = create_gem_purl(&gem.name, version_for_purl);
748
749        let extracted_requirement = if !gem.requirements.is_empty() {
750            Some(gem.requirements.join(", "))
751        } else {
752            gem.version.clone()
753        };
754
755        let extra_data = build_gem_source_extra_data(&gem);
756
757        dependencies.push(Dependency {
758            purl,
759            extracted_requirement,
760            scope: Some("dependencies".to_string()),
761            is_runtime: Some(true),
762            is_optional: Some(false),
763            is_pinned: Some(gem.pinned),
764            is_direct: Some(true),
765            resolved_package: None,
766            extra_data,
767        });
768    }
769
770    dependencies.sort_by(|left, right| {
771        left.purl
772            .as_deref()
773            .cmp(&right.purl.as_deref())
774            .then_with(|| {
775                left.extracted_requirement
776                    .as_deref()
777                    .cmp(&right.extracted_requirement.as_deref())
778            })
779    });
780
781    // Build extra_data
782    let mut extra_data = HashMap::new();
783    if !platforms.is_empty() {
784        extra_data.insert(
785            "platforms".to_string(),
786            serde_json::Value::Array(
787                platforms
788                    .into_iter()
789                    .map(serde_json::Value::String)
790                    .collect(),
791            ),
792        );
793    }
794    if let Some(bv) = bundler_version {
795        extra_data.insert("bundler_version".to_string(), serde_json::Value::String(bv));
796    }
797
798    let purl = package_name
799        .as_deref()
800        .map(|n| create_gem_purl(n, package_version.as_deref()))
801        .unwrap_or(None);
802
803    PackageData {
804        package_type: Some(PACKAGE_TYPE),
805        name: package_name,
806        version: package_version,
807        primary_language: Some("Ruby".to_string()),
808        download_url,
809        dependencies,
810        repository_homepage_url,
811        repository_download_url,
812        api_data_url,
813        extra_data: if extra_data.is_empty() {
814            None
815        } else {
816            Some(extra_data)
817        },
818        datasource_id: Some(DatasourceId::GemfileLock),
819        purl,
820        ..default_package_data()
821    }
822}
823
824fn build_gem_source_extra_data(gem: &GemInfo) -> Option<HashMap<String, serde_json::Value>> {
825    if gem.gem_type != "GIT" && gem.gem_type != "PATH" && gem.gem_type != "SVN" {
826        return None;
827    }
828
829    let mut extra = HashMap::new();
830    extra.insert(
831        "source_type".to_string(),
832        serde_json::Value::String(gem.gem_type.clone()),
833    );
834
835    if let Some(ref remote) = gem.remote {
836        extra.insert(
837            "remote".to_string(),
838            serde_json::Value::String(remote.clone()),
839        );
840    }
841    if let Some(ref revision) = gem.revision {
842        extra.insert(
843            "revision".to_string(),
844            serde_json::Value::String(revision.clone()),
845        );
846    }
847    if let Some(ref ref_field) = gem.ref_field {
848        extra.insert(
849            "ref".to_string(),
850            serde_json::Value::String(ref_field.clone()),
851        );
852    }
853    if let Some(ref branch) = gem.branch {
854        extra.insert(
855            "branch".to_string(),
856            serde_json::Value::String(branch.clone()),
857        );
858    }
859    if let Some(ref tag) = gem.tag {
860        extra.insert("tag".to_string(), serde_json::Value::String(tag.clone()));
861    }
862
863    Some(extra)
864}
865
/// Splits a combined `version[-platform]` string at its first `-`.
///
/// Examples: `"2.6.3"` -> `("2.6.3", None)`, `"2.6.3-java"` -> `("2.6.3", Some("java"))`.
/// An empty input yields `(None, None)`.
fn parse_version_platform(s: &str) -> (Option<String>, Option<String>) {
    if s.is_empty() {
        return (None, None);
    }

    match s.split_once('-') {
        Some((version, platform)) => (Some(version.to_owned()), Some(platform.to_owned())),
        None => (Some(s.to_owned()), None),
    }
}
880
881/// Creates a gem PURL.
882fn create_gem_purl(name: &str, version: Option<&str>) -> Option<String> {
883    let mut purl = match PackageUrl::new(PACKAGE_TYPE.as_str(), name) {
884        Ok(p) => p,
885        Err(e) => {
886            warn!("Failed to create PURL for gem '{}': {}", name, e);
887            return None;
888        }
889    };
890
891    if let Some(v) = version
892        && let Err(e) = purl.with_version(v)
893    {
894        warn!("Failed to set version '{}' for gem '{}': {}", v, name, e);
895    }
896
897    Some(purl.to_string())
898}
899
/// Builds the rubygems.org page URL for a gem, or `None` when `name` is empty.
///
/// With a version, the URL points at that version's page; surrounding
/// whitespace and `/` characters are stripped from the version first.
fn rubygems_homepage_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(raw) => {
            let v = raw.trim().trim_matches('/');
            format!("https://rubygems.org/gems/{}/versions/{}", name, v)
        }
        None => format!("https://rubygems.org/gems/{}", name),
    };
    Some(url)
}
912
/// Builds the rubygems.org `.gem` download URL.
///
/// Returns `None` when `name` is empty or no version is known. Name and
/// version are stripped of surrounding whitespace and `/`. A platform other
/// than `"ruby"` is appended to the version as `-<platform>`.
fn rubygems_download_url(
    name: &str,
    version: Option<&str>,
    platform: Option<&str>,
) -> Option<String> {
    if name.is_empty() {
        return None;
    }
    let version = version?;

    let name = name.trim().trim_matches('/');
    let version = version.trim().trim_matches('/');

    let version_plat = match platform {
        Some(p) if p != "ruby" => format!("{}-{}", version, p),
        _ => version.to_string(),
    };

    let url = format!(
        "https://rubygems.org/downloads/{}-{}.gem",
        name, version_plat
    );
    Some(url)
}
940
/// Builds the rubygems.org JSON API URL for a gem, or `None` when `name` is empty.
///
/// With a version the v2 per-version endpoint is used; without one, the v1
/// versions listing for the gem.
fn rubygems_api_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(v) => format!(
            "https://rubygems.org/api/v2/rubygems/{}/versions/{}.json",
            name, v
        ),
        None => format!(
            "https://rubygems.org/api/v1/versions/{}.json",
            name
        ),
    };
    Some(url)
}
958
959fn get_rubygems_urls(
960    name: &str,
961    version: Option<&str>,
962    platform: Option<&str>,
963) -> (
964    Option<String>,
965    Option<String>,
966    Option<String>,
967    Option<String>,
968) {
969    let repository_homepage_url = rubygems_homepage_url(name, version);
970    let repository_download_url = rubygems_download_url(name, version, platform);
971    let api_data_url = rubygems_api_url(name, version);
972    let download_url = repository_download_url.clone();
973
974    (
975        repository_homepage_url,
976        repository_download_url,
977        api_data_url,
978        download_url,
979    )
980}
981
982/// Returns a default PackageData with gem-specific settings.
983fn default_package_data() -> PackageData {
984    PackageData {
985        package_type: Some(PACKAGE_TYPE),
986        primary_language: Some("Ruby".to_string()),
987        ..Default::default()
988    }
989}
990
991fn default_package_data_with_datasource(datasource_id: DatasourceId) -> PackageData {
992    PackageData {
993        datasource_id: Some(datasource_id),
994        ..default_package_data()
995    }
996}
997
998// =============================================================================
999// Gemspec Parser (Ruby DSL)
1000// =============================================================================
1001
1002/// Ruby .gemspec file parser.
1003///
1004/// Parses `Gem::Specification.new` blocks using regex-based extraction.
1005/// Handles frozen strings (Bug #1), variable version resolution (Bug #2),
1006/// and RFC 5322 email parsing (Bug #6).
1007pub struct GemspecParser;
1008
1009impl PackageParser for GemspecParser {
1010    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1011
1012    fn metadata() -> Vec<ParserMetadata> {
1013        vec![ParserMetadata {
1014            description: "Ruby .gemspec manifest",
1015            file_patterns: &[
1016                "**/*.gemspec",
1017                "**/data.gz-extract/*.gemspec",
1018                "**/specifications/*.gemspec",
1019            ],
1020            package_type: "gem",
1021            primary_language: "Ruby",
1022            documentation_url: Some("https://guides.rubygems.org/specification-reference/"),
1023        }]
1024    }
1025
1026    fn extract_packages(path: &Path) -> Vec<PackageData> {
1027        let datasource_id = gemspec_datasource_id(path);
1028        let content = match read_file_to_string(path, None) {
1029            Ok(c) => c,
1030            Err(e) => {
1031                warn!("Failed to read .gemspec at {:?}: {}", path, e);
1032                return vec![default_package_data_with_datasource(datasource_id)];
1033            }
1034        };
1035
1036        let mut package_data = parse_gemspec_with_context(&content, path.parent());
1037        package_data.datasource_id = Some(datasource_id);
1038        vec![package_data]
1039    }
1040
1041    fn is_match(path: &Path) -> bool {
1042        path.extension()
1043            .and_then(|ext| ext.to_str())
1044            .is_some_and(|ext| ext == "gemspec")
1045    }
1046}
1047
/// Renders `path` as a (lossy) string with every `\` turned into `/`, so
/// directory-marker checks work on Windows-style paths as well.
fn normalized_ruby_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
1051
1052fn gemfile_datasource_id(path: &Path) -> DatasourceId {
1053    if normalized_ruby_path(path).contains("/data.gz-extract/") {
1054        DatasourceId::GemfileExtracted
1055    } else {
1056        DatasourceId::Gemfile
1057    }
1058}
1059
1060fn gemfile_lock_datasource_id(path: &Path) -> DatasourceId {
1061    if normalized_ruby_path(path).contains("/data.gz-extract/") {
1062        DatasourceId::GemfileLockExtracted
1063    } else {
1064        DatasourceId::GemfileLock
1065    }
1066}
1067
1068fn gemspec_datasource_id(path: &Path) -> DatasourceId {
1069    let normalized = normalized_ruby_path(path);
1070    if normalized.contains("/data.gz-extract/") {
1071        DatasourceId::GemspecExtracted
1072    } else if normalized.contains("/specifications/") {
1073        DatasourceId::GemGemspecInstalledSpecifications
1074    } else {
1075        DatasourceId::Gemspec
1076    }
1077}
1078
1079/// Cleans a value extracted from gemspec by stripping quotes, .freeze, %q{}, and brackets.
1080fn clean_gemspec_value(s: &str) -> String {
1081    let s = strip_freeze_suffix(s).trim();
1082
1083    let s = if let Some(pos) = s.find(" #") {
1084        s[..pos].trim()
1085    } else {
1086        s
1087    };
1088
1089    let s = if let Some(stripped) = s.strip_prefix("%q{") {
1090        stripped.strip_suffix('}').unwrap_or(stripped)
1091    } else if let Some(stripped) = s.strip_prefix("%q<") {
1092        stripped.strip_suffix('>').unwrap_or(stripped)
1093    } else if let Some(stripped) = s.strip_prefix("%q[") {
1094        stripped.strip_suffix(']').unwrap_or(stripped)
1095    } else if let Some(stripped) = s.strip_prefix("%q(") {
1096        stripped.strip_suffix(')').unwrap_or(stripped)
1097    } else {
1098        s
1099    };
1100
1101    let s = s
1102        .trim_start_matches('"')
1103        .trim_end_matches('"')
1104        .trim_start_matches('\'')
1105        .trim_end_matches('\'');
1106    let s = strip_freeze_suffix(s).trim();
1107    s.to_string()
1108}
1109
1110/// Extracts items from a Ruby array literal like `["a", "b", "c"]`.
1111fn extract_ruby_array(s: &str) -> Vec<String> {
1112    let s = strip_freeze_suffix(s.trim());
1113    let s = s.trim_start_matches('[').trim_end_matches(']');
1114    let item_re = match Regex::new(r#"["']([^"']*?)["'](?:\.freeze)?"#) {
1115        Ok(r) => r,
1116        Err(_) => return Vec::new(),
1117    };
1118    item_re
1119        .captures_iter(s)
1120        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
1121        .collect()
1122}
1123
1124fn extract_all_ruby_values(s: &str) -> Vec<String> {
1125    let value_re = match Regex::new(r#"%q[\{<\[(]([^\}>\])]+)[\}>\])]|["']([^"']+)["']"#) {
1126        Ok(r) => r,
1127        Err(_) => return Vec::new(),
1128    };
1129
1130    value_re
1131        .captures_iter(s)
1132        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)))
1133        .map(|m| clean_gemspec_value(m.as_str()))
1134        .collect()
1135}
1136
1137fn extract_first_ruby_value(s: &str) -> Option<String> {
1138    extract_all_ruby_values(s).into_iter().next()
1139}
1140
/// Returns the trimmed text following the first top-level comma in `args`.
///
/// Commas inside quotes, brackets (`[]`, `{}`, `<>`) or parentheses do not
/// count. Returns `""` when there is no top-level comma.
fn after_first_argument(args: &str) -> &str {
    let mut brackets = 0usize;
    let mut parens = 0usize;
    let mut open_quote: Option<char> = None;
    let mut cursor = args.char_indices();

    while let Some((offset, ch)) = cursor.next() {
        if let Some(delimiter) = open_quote {
            if ch == '\\' {
                // An escape consumes the following character too.
                cursor.next();
            } else if ch == delimiter {
                open_quote = None;
            }
            continue;
        }

        match ch {
            '\'' | '"' => open_quote = Some(ch),
            '[' | '{' | '<' => brackets += 1,
            ']' | '}' | '>' => brackets = brackets.saturating_sub(1),
            '(' => parens += 1,
            ')' => parens = parens.saturating_sub(1),
            ',' if brackets == 0 && parens == 0 => return args[offset + 1..].trim(),
            _ => {}
        }
    }

    ""
}
1178
1179/// Bug #2: Resolves variable version references like `CSV::VERSION` or `RAILS_VERSION`.
1180///
1181/// Scans the file content for constant definitions matching the variable name
1182/// and returns the resolved string value.
1183fn resolve_variable_version(var_name: &str, contexts: &[String]) -> Option<String> {
1184    let var_name = var_name.trim();
1185    if var_name.is_empty() {
1186        return None;
1187    }
1188
1189    for candidate in candidate_constant_names(var_name) {
1190        let escaped = regex::escape(&candidate);
1191        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(.+)$"#, escaped);
1192        let Ok(re) = Regex::new(&pattern) else {
1193            continue;
1194        };
1195
1196        for context in contexts {
1197            if let Some(caps) = re.captures(context)
1198                && let Some(expression) = caps.get(1)
1199                && let Some(resolved) =
1200                    resolve_scalar_expression(expression.as_str(), None, contexts)
1201            {
1202                return Some(resolved);
1203            }
1204        }
1205    }
1206
1207    None
1208}
1209
1210fn resolve_variable_array(var_name: &str, contexts: &[String]) -> Option<Vec<String>> {
1211    let var_name = var_name.trim();
1212    if var_name.is_empty() {
1213        return None;
1214    }
1215
1216    for candidate in candidate_constant_names(var_name) {
1217        let escaped = regex::escape(&candidate);
1218        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(\[[^\n]+\])"#, escaped);
1219        let Ok(re) = Regex::new(&pattern) else {
1220            continue;
1221        };
1222
1223        for context in contexts {
1224            if let Some(caps) = re.captures(context)
1225                && let Some(raw) = caps.get(1)
1226            {
1227                let values = extract_ruby_array(raw.as_str());
1228                if !values.is_empty() {
1229                    return Some(values);
1230                }
1231            }
1232        }
1233    }
1234
1235    None
1236}
1237
/// Lists the names under which a constant may be defined: the name as
/// written, followed by its last `::` segment when that differs.
fn candidate_constant_names(var_name: &str) -> Vec<String> {
    let unqualified = var_name
        .split("::")
        .last()
        .filter(|tail| *tail != var_name);

    std::iter::once(var_name)
        .chain(unqualified)
        .map(str::to_owned)
        .collect()
}
1247
/// Reports whether `s` is shaped like a Ruby local variable name: it starts
/// with `_` or an ASCII lowercase letter and continues with `_` or ASCII
/// alphanumerics only.
fn looks_like_local_variable_reference(s: &str) -> bool {
    let Some(first) = s.chars().next() else {
        return false;
    };

    let valid_start = first == '_' || first.is_ascii_lowercase();
    valid_start
        && s.chars()
            .skip(1)
            .all(|c| c == '_' || c.is_ascii_alphanumeric())
}
1253
/// Determines the directory that file reads triggered by a gemspec must stay within.
///
/// When the canonical `base_dir` lies under the canonical current working
/// directory, the working directory is the root. Otherwise the canonical
/// `base_dir` itself is. Returns `None` without a base directory or when it
/// cannot be canonicalized.
fn resolve_ruby_read_root(base_dir: Option<&Path>) -> Option<PathBuf> {
    let base_dir = base_dir?;

    let enclosing_cwd = || -> Option<PathBuf> {
        let canonical_cwd = std::env::current_dir().ok()?.canonicalize().ok()?;
        let canonical_base = base_dir.canonicalize().ok()?;
        canonical_base
            .starts_with(&canonical_cwd)
            .then_some(canonical_cwd)
    };

    enclosing_cwd().or_else(|| base_dir.canonicalize().ok())
}
1268
/// Canonicalizes `path` and returns it only when the result lies within
/// `allowed_root`; a path that cannot be canonicalized yields `None`.
fn resolve_ruby_read_path(path: PathBuf, allowed_root: &Path) -> Option<PathBuf> {
    match path.canonicalize() {
        Ok(resolved) if resolved.starts_with(allowed_root) => Some(resolved),
        _ => None,
    }
}
1275
1276fn resolve_file_read_argument(args: &str, base_dir: Option<&Path>) -> Option<String> {
1277    let base_dir = base_dir?;
1278    let allowed_root = resolve_ruby_read_root(base_dir.into())?;
1279    let relative_path = extract_first_ruby_value(args)?;
1280    if relative_path.is_empty() {
1281        return None;
1282    }
1283
1284    let candidate = Path::new(&relative_path);
1285    let path = if candidate.is_absolute() {
1286        candidate.to_path_buf()
1287    } else {
1288        base_dir.join(candidate)
1289    };
1290
1291    let safe_path = resolve_ruby_read_path(path, &allowed_root)?;
1292
1293    fs::read_to_string(safe_path)
1294        .ok()
1295        .map(|content| content.trim().to_string())
1296        .filter(|content| !content.is_empty())
1297}
1298
1299fn resolve_scalar_expression(
1300    expression: &str,
1301    base_dir: Option<&Path>,
1302    contexts: &[String],
1303) -> Option<String> {
1304    let expression = if let Some(pos) = expression.find(" #") {
1305        expression[..pos].trim()
1306    } else {
1307        expression.trim()
1308    };
1309
1310    let file_read_re = Regex::new(r#"^File\.read\((.+)\)(?:\.strip)?(?:\.freeze)?$"#).ok()?;
1311    if let Some(caps) = file_read_re.captures(expression) {
1312        return caps
1313            .get(1)
1314            .and_then(|m| resolve_file_read_argument(m.as_str(), base_dir));
1315    }
1316
1317    if let Some(joined) = resolve_joined_constant_string(expression, contexts) {
1318        return Some(joined);
1319    }
1320
1321    if let Some(value) = extract_first_ruby_value(expression) {
1322        return Some(interpolate_ruby_constant_string(&value, contexts));
1323    }
1324
1325    let cleaned = clean_gemspec_value(expression);
1326    if looks_like_constant_reference(&cleaned) {
1327        return resolve_variable_version(&cleaned, contexts).or(Some(cleaned));
1328    }
1329
1330    None
1331}
1332
1333fn resolve_joined_constant_string(expression: &str, contexts: &[String]) -> Option<String> {
1334    let expression = strip_freeze_suffix(expression.trim());
1335    if !expression.starts_with('[') {
1336        return None;
1337    }
1338    let join_index = expression.find("].join(")?;
1339    let body = &expression[1..join_index];
1340    let separator_expr = expression[join_index + 7..].strip_suffix(')')?.trim();
1341    let separator = extract_first_ruby_value(separator_expr)?;
1342
1343    let mut parts = Vec::new();
1344    for item in body.split(',').take(MAX_ITERATION_COUNT) {
1345        let resolved = resolve_scalar_expression(item.trim(), None, contexts)?;
1346        parts.push(resolved);
1347    }
1348
1349    Some(parts.join(&separator))
1350}
1351
1352fn interpolate_ruby_constant_string(value: &str, contexts: &[String]) -> String {
1353    if !value.contains("#{") {
1354        return value.to_string();
1355    }
1356
1357    let Ok(interpolation_re) = Regex::new(r#"#\{([^}]+)\}"#) else {
1358        return value.to_string();
1359    };
1360    interpolation_re
1361        .replace_all(value, |captures: &regex::Captures<'_>| {
1362            let reference = captures
1363                .get(1)
1364                .map(|m| m.as_str().trim())
1365                .unwrap_or_default();
1366            resolve_variable_version(reference, contexts).unwrap_or_else(|| {
1367                captures
1368                    .get(0)
1369                    .map(|value| value.as_str().to_string())
1370                    .unwrap_or_default()
1371            })
1372        })
1373        .into_owned()
1374}
1375
1376fn resolve_local_variable_value(
1377    var_name: &str,
1378    content: &str,
1379    base_dir: Option<&Path>,
1380    contexts: &[String],
1381) -> Option<String> {
1382    let escaped = regex::escape(var_name.trim());
1383    let pattern = format!(r#"(?m)^\s*{}\s*=\s*(.+)$"#, escaped);
1384    let re = Regex::new(&pattern).ok()?;
1385
1386    re.captures_iter(content).find_map(|caps| {
1387        caps.get(1)
1388            .and_then(|m| resolve_scalar_expression(m.as_str(), base_dir, contexts))
1389    })
1390}
1391
1392fn resolve_gemspec_scalar_value(
1393    raw_value: &str,
1394    content: &str,
1395    base_dir: Option<&Path>,
1396    contexts: &[String],
1397) -> Option<String> {
1398    let cleaned = truncate_field(clean_gemspec_value(raw_value));
1399    if cleaned.is_empty() {
1400        return None;
1401    }
1402
1403    if looks_like_constant_reference(&cleaned) {
1404        return resolve_variable_version(&cleaned, contexts)
1405            .map(truncate_field)
1406            .or(Some(cleaned));
1407    }
1408
1409    if looks_like_local_variable_reference(&cleaned) {
1410        return resolve_local_variable_value(&cleaned, content, base_dir, contexts)
1411            .map(truncate_field)
1412            .or(Some(cleaned));
1413    }
1414
1415    Some(cleaned)
1416}
1417
1418fn load_required_ruby_contexts(content: &str, base_dir: Option<&Path>) -> Vec<String> {
1419    let mut contexts = vec![content.to_string()];
1420    let Some(base_dir) = base_dir else {
1421        return contexts;
1422    };
1423    let allowed_root = resolve_ruby_read_root(Some(base_dir));
1424
1425    let require_re = match Regex::new(r#"(?m)^\s*require(?:_relative)?\s+["']([^"']+)["']"#) {
1426        Ok(re) => re,
1427        Err(_) => return contexts,
1428    };
1429
1430    for caps in require_re.captures_iter(content) {
1431        let Some(required) = caps.get(1).map(|m| m.as_str()) else {
1432            continue;
1433        };
1434        for candidate in candidate_require_paths(base_dir, required) {
1435            let Some(safe_candidate) = allowed_root
1436                .as_deref()
1437                .and_then(|root| resolve_ruby_read_path(candidate, root))
1438            else {
1439                continue;
1440            };
1441            if let Ok(required_content) = read_file_to_string(&safe_candidate, None) {
1442                contexts.push(required_content);
1443                break;
1444            }
1445        }
1446    }
1447
1448    contexts
1449}
1450
/// Lists the files a `require` argument may refer to.
///
/// `::` is turned into `/` and `.rb` is appended when missing. The file is
/// looked for directly under `base_dir` first, then under `base_dir/lib`.
fn candidate_require_paths(base_dir: &Path, required: &str) -> Vec<PathBuf> {
    let mut filename = required.replace("::", "/");
    if !filename.ends_with(".rb") {
        filename.push_str(".rb");
    }

    let direct = base_dir.join(&filename);
    let under_lib = base_dir.join("lib").join(&filename);
    vec![direct, under_lib]
}
1464
/// Reports whether `s` could name a Ruby constant: it contains `::` or
/// starts with an ASCII uppercase letter.
fn looks_like_constant_reference(s: &str) -> bool {
    if s.contains("::") {
        return true;
    }
    matches!(s.chars().next(), Some(first) if first.is_ascii_uppercase())
}
1468
/// Parses `.gemspec` content without a base directory, so only the content
/// itself is available for resolving references. Test-only convenience.
#[cfg(test)]
fn parse_gemspec(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_gemspec_with_context(content, no_base_dir)
}
1474
1475fn parse_gemspec_with_context(content: &str, base_dir: Option<&Path>) -> PackageData {
1476    let contexts = load_required_ruby_contexts(content, base_dir);
1477
1478    // Regex for spec.name = "value" or s.name = "value"
1479    // The spec variable name varies: spec, s, gem, etc.
1480    let field_re = match Regex::new(
1481        r#"(?m)^\s*\w+\.(name|version|summary|description|homepage|license)\s*=\s*(.+)$"#,
1482    ) {
1483        Ok(r) => r,
1484        Err(e) => {
1485            warn!("Failed to compile gemspec field regex: {}", e);
1486            return default_package_data_with_datasource(DatasourceId::Gemspec);
1487        }
1488    };
1489
1490    let licenses_re = match Regex::new(r#"(?m)^\s*\w+\.licenses\s*=\s*(.+)$"#) {
1491        Ok(r) => r,
1492        Err(e) => {
1493            warn!("Failed to compile licenses regex: {}", e);
1494            return default_package_data_with_datasource(DatasourceId::Gemspec);
1495        }
1496    };
1497
1498    let authors_re = match Regex::new(r#"(?m)^\s*\w+\.(?:authors|author)\s*=\s*(.+)$"#) {
1499        Ok(r) => r,
1500        Err(e) => {
1501            warn!("Failed to compile authors regex: {}", e);
1502            return default_package_data_with_datasource(DatasourceId::Gemspec);
1503        }
1504    };
1505
1506    let email_re = match Regex::new(r#"(?m)^\s*\w+\.email\s*=\s*(.+)$"#) {
1507        Ok(r) => r,
1508        Err(e) => {
1509            warn!("Failed to compile email regex: {}", e);
1510            return default_package_data_with_datasource(DatasourceId::Gemspec);
1511        }
1512    };
1513
1514    let dependency_call_re = match Regex::new(
1515        r#"(?m)^\s*\w+\.(add_(?:development_|runtime_)?dependency)\s*\(?(.+?)\)?\s*$"#,
1516    ) {
1517        Ok(r) => r,
1518        Err(e) => {
1519            warn!("Failed to compile gemspec dependency regex: {}", e);
1520            return default_package_data_with_datasource(DatasourceId::Gemspec);
1521        }
1522    };
1523
1524    let mut name: Option<String> = None;
1525    let mut version: Option<String> = None;
1526    let mut summary: Option<String> = None;
1527    let mut description: Option<String> = None;
1528    let mut homepage: Option<String> = None;
1529    let mut license: Option<String> = None;
1530    let mut licenses: Vec<String> = Vec::new();
1531    let mut authors: Vec<String> = Vec::new();
1532    let mut emails: Vec<String> = Vec::new();
1533    let mut dependencies: Vec<Dependency> = Vec::new();
1534
1535    // Extract basic fields
1536    for caps in field_re.captures_iter(content).take(MAX_ITERATION_COUNT) {
1537        let field_name = match caps.get(1) {
1538            Some(m) => m.as_str(),
1539            None => continue,
1540        };
1541        let raw_value = match caps.get(2) {
1542            Some(m) => m.as_str().trim(),
1543            None => continue,
1544        };
1545
1546        match field_name {
1547            "name" => name = resolve_gemspec_scalar_value(raw_value, content, base_dir, &contexts),
1548            "version" => {
1549                version = resolve_gemspec_scalar_value(raw_value, content, base_dir, &contexts);
1550            }
1551            "summary" => {
1552                summary = resolve_gemspec_scalar_value(raw_value, content, base_dir, &contexts)
1553            }
1554            "description" => description = Some(truncate_field(clean_gemspec_value(raw_value))),
1555            "homepage" => {
1556                homepage = resolve_gemspec_scalar_value(raw_value, content, base_dir, &contexts)
1557            }
1558            "license" => license = Some(truncate_field(clean_gemspec_value(raw_value))),
1559            _ => {}
1560        }
1561    }
1562
1563    // Extract licenses (plural)
1564    for caps in licenses_re.captures_iter(content).take(MAX_ITERATION_COUNT) {
1565        if let Some(raw) = caps.get(1) {
1566            licenses = extract_ruby_array(raw.as_str());
1567        }
1568    }
1569
1570    // Extract authors
1571    for caps in authors_re.captures_iter(content).take(MAX_ITERATION_COUNT) {
1572        if let Some(raw) = caps.get(1) {
1573            let raw_str = raw.as_str().trim();
1574            if raw_str.starts_with('[') {
1575                authors = extract_ruby_array(raw_str);
1576            } else if looks_like_constant_reference(raw_str) {
1577                authors = resolve_variable_array(raw_str, &contexts)
1578                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1579            } else {
1580                authors.push(clean_gemspec_value(raw_str));
1581            }
1582        }
1583    }
1584
1585    // Extract emails
1586    for caps in email_re.captures_iter(content).take(MAX_ITERATION_COUNT) {
1587        if let Some(raw) = caps.get(1) {
1588            let raw_str = raw.as_str().trim();
1589            if raw_str.starts_with('[') {
1590                emails = extract_ruby_array(raw_str);
1591            } else if looks_like_constant_reference(raw_str) {
1592                emails = resolve_variable_array(raw_str, &contexts)
1593                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1594            } else {
1595                emails.push(clean_gemspec_value(raw_str));
1596            }
1597        }
1598    }
1599
1600    // Build parties from authors and emails
1601    let mut parties: Vec<Party> = Vec::new();
1602
1603    if authors.len() == 1 && emails.len() == 1 {
1604        let email_str = emails.first().map(String::as_str);
1605        let (parsed_email_name, parsed_email) = match email_str {
1606            Some(e) => split_name_email(e),
1607            None => (None, None),
1608        };
1609
1610        parties.push(Party {
1611            r#type: Some("person".to_string()),
1612            role: Some("author".to_string()),
1613            name: authors.first().cloned().or(parsed_email_name),
1614            email: parsed_email.or_else(|| {
1615                email_str
1616                    .filter(|e| e.contains('@') && !e.contains('<'))
1617                    .map(|e| e.to_string())
1618            }),
1619            url: None,
1620            organization: None,
1621            organization_url: None,
1622            timezone: None,
1623        });
1624    } else {
1625        for author_name in authors {
1626            parties.push(Party {
1627                r#type: Some("person".to_string()),
1628                role: Some("author".to_string()),
1629                name: Some(author_name),
1630                email: None,
1631                url: None,
1632                organization: None,
1633                organization_url: None,
1634                timezone: None,
1635            });
1636        }
1637
1638        for email_str in emails {
1639            let (parsed_email_name, parsed_email) = if email_str.contains('<') {
1640                split_name_email(&email_str)
1641            } else {
1642                (None, None)
1643            };
1644            parties.push(Party {
1645                r#type: Some("person".to_string()),
1646                role: Some("author".to_string()),
1647                name: parsed_email_name,
1648                email: parsed_email.or_else(|| email_str.contains('@').then_some(email_str)),
1649                url: None,
1650                organization: None,
1651                organization_url: None,
1652                timezone: None,
1653            });
1654        }
1655    }
1656
1657    for caps in dependency_call_re
1658        .captures_iter(content)
1659        .take(MAX_ITERATION_COUNT)
1660    {
1661        let method = match caps.get(1) {
1662            Some(m) => m.as_str(),
1663            None => continue,
1664        };
1665        let args = match caps.get(2) {
1666            Some(m) => m.as_str(),
1667            None => continue,
1668        };
1669
1670        let Some(dep_name) = extract_first_ruby_value(args).map(truncate_field) else {
1671            continue;
1672        };
1673        let version_parts = extract_all_ruby_values(after_first_argument(args));
1674        let extracted_requirement = if version_parts.is_empty() {
1675            None
1676        } else {
1677            Some(version_parts.join(", "))
1678        };
1679        let purl = create_gem_purl(&dep_name, None);
1680        let is_development = method == "add_development_dependency";
1681        let scope = if is_development {
1682            "development"
1683        } else {
1684            "runtime"
1685        };
1686
1687        dependencies.push(Dependency {
1688            purl,
1689            extracted_requirement,
1690            scope: Some(scope.to_string()),
1691            is_runtime: Some(!is_development),
1692            is_optional: Some(is_development),
1693            is_pinned: None,
1694            is_direct: Some(true),
1695            resolved_package: None,
1696            extra_data: None,
1697        });
1698    }
1699
1700    // Extract license statement only - detection happens in separate engine
1701    let extracted_license_statement = if !licenses.is_empty() {
1702        Some(licenses.join(" AND "))
1703    } else {
1704        license
1705    };
1706
1707    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1708        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1709
1710    // Prefer description over summary
1711    let final_description = description.or(summary);
1712
1713    // Build PURL
1714    let purl = name
1715        .as_deref()
1716        .map(|n| create_gem_purl(n, version.as_deref()))
1717        .unwrap_or(None);
1718
1719    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1720        if let Some(n) = name.as_deref() {
1721            get_rubygems_urls(n, version.as_deref(), None)
1722        } else {
1723            (None, None, None, None)
1724        };
1725
1726    PackageData {
1727        package_type: Some(PACKAGE_TYPE),
1728        name,
1729        version,
1730        primary_language: Some("Ruby".to_string()),
1731        description: final_description,
1732        homepage_url: homepage,
1733        download_url,
1734        declared_license_expression,
1735        declared_license_expression_spdx,
1736        license_detections,
1737        extracted_license_statement,
1738        parties,
1739        dependencies,
1740        repository_homepage_url,
1741        repository_download_url,
1742        api_data_url,
1743        datasource_id: Some(DatasourceId::Gemspec),
1744        purl,
1745        ..default_package_data()
1746    }
1747}
1748
1749// =============================================================================
1750// .gem Archive Parser (Wave 3)
1751// =============================================================================
1752
/// Largest `.gem` archive, in bytes, that `extract_gem_archive` accepts;
/// larger files are rejected with an error before being opened.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
/// Per-file size limit in bytes.
// NOTE(review): not referenced in the visible part of this file; presumably
// caps each entry read from the archive. Confirm against `extract_gem_archive`.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
/// Compression ratio limit.
// NOTE(review): not referenced in the visible part of this file; presumably
// the decompressed-to-compressed ratio used for zip-bomb detection. Confirm.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
1756
1757/// Parser for .gem archive files.
1758///
1759/// Extracts metadata from Ruby .gem packages, which are tar archives
1760/// containing a gzip-compressed YAML metadata file (`metadata.gz`).
1761///
1762/// Includes safety checks against zip bombs and oversized archives.
1763pub struct GemArchiveParser;
1764
1765impl PackageParser for GemArchiveParser {
1766    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1767
1768    fn metadata() -> Vec<ParserMetadata> {
1769        vec![ParserMetadata {
1770            description: "Ruby .gem archive",
1771            file_patterns: &["**/*.gem"],
1772            package_type: "gem",
1773            primary_language: "Ruby",
1774            documentation_url: Some("https://guides.rubygems.org/specification-reference/"),
1775        }]
1776    }
1777
1778    fn extract_packages(path: &Path) -> Vec<PackageData> {
1779        vec![match extract_gem_archive(path) {
1780            Ok(data) => data,
1781            Err(e) => {
1782                warn!("Failed to extract .gem archive at {:?}: {}", path, e);
1783                default_package_data_with_datasource(DatasourceId::GemArchive)
1784            }
1785        }]
1786    }
1787
1788    fn is_match(path: &Path) -> bool {
1789        path.extension()
1790            .and_then(|ext| ext.to_str())
1791            .is_some_and(|ext| ext == "gem")
1792    }
1793}
1794
1795fn extract_gem_archive(path: &Path) -> Result<PackageData, String> {
1796    let file_metadata =
1797        fs::metadata(path).map_err(|e| format!("Failed to read file metadata: {}", e))?;
1798    let archive_size = file_metadata.len();
1799
1800    if archive_size > MAX_ARCHIVE_SIZE {
1801        return Err(format!(
1802            "Archive too large: {} bytes (limit: {} bytes)",
1803            archive_size, MAX_ARCHIVE_SIZE
1804        ));
1805    }
1806
1807    let file = File::open(path).map_err(|e| format!("Failed to open archive: {}", e))?;
1808    let mut archive = Archive::new(file);
1809
1810    let mut entry_count: usize = 0;
1811    for entry_result in archive
1812        .entries()
1813        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1814    {
1815        entry_count += 1;
1816        if entry_count > MAX_ITERATION_COUNT {
1817            warn!(
1818                "Exceeded max tar entry count ({}) in .gem archive, stopping iteration",
1819                MAX_ITERATION_COUNT
1820            );
1821            break;
1822        }
1823
1824        let entry = entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1825        let entry_path = entry
1826            .path()
1827            .map_err(|e| format!("Failed to get entry path: {}", e))?;
1828        let entry_str = entry_path.to_string_lossy();
1829        if entry_str.contains("..") {
1830            warn!("Skipping tar entry with path traversal: {}", entry_str);
1831            continue;
1832        }
1833
1834        if entry_path.to_str() == Some("metadata.gz") {
1835            let entry_size = entry.size();
1836            if entry_size > MAX_FILE_SIZE {
1837                return Err(format!(
1838                    "metadata.gz too large: {} bytes (limit: {} bytes)",
1839                    entry_size, MAX_FILE_SIZE
1840                ));
1841            }
1842
1843            let mut decoder = GzDecoder::new(entry);
1844            let mut content = Vec::new();
1845            let mut limited = std::io::Read::take(&mut decoder, MAX_FILE_SIZE + 1);
1846            limited
1847                .read_to_end(&mut content)
1848                .map_err(|e| format!("Failed to decompress metadata.gz: {}", e))?;
1849
1850            if content.len() > MAX_FILE_SIZE as usize {
1851                return Err(format!(
1852                    "Decompressed metadata too large: exceeds {} byte limit",
1853                    MAX_FILE_SIZE
1854                ));
1855            }
1856
1857            let content = match String::from_utf8(content) {
1858                Ok(s) => s,
1859                Err(err) => {
1860                    let bytes = err.into_bytes();
1861                    warn!("Invalid UTF-8 in gem metadata; using lossy conversion");
1862                    String::from_utf8_lossy(&bytes).into_owned()
1863                }
1864            };
1865
1866            let uncompressed_size = content.len() as u64;
1867            if entry_size > 0 {
1868                let ratio = uncompressed_size as f64 / entry_size as f64;
1869                if ratio > MAX_COMPRESSION_RATIO {
1870                    return Err(format!(
1871                        "Suspicious compression ratio: {:.2}:1 (limit: {:.0}:1)",
1872                        ratio, MAX_COMPRESSION_RATIO
1873                    ));
1874                }
1875            }
1876
1877            return parse_gem_metadata_yaml(&content, DatasourceId::GemArchive);
1878        }
1879    }
1880
1881    Err("metadata.gz not found in .gem archive".to_string())
1882}
1883
1884fn parse_gem_metadata_yaml(
1885    content: &str,
1886    datasource_id: DatasourceId,
1887) -> Result<PackageData, String> {
1888    // Ruby YAML tagged types need to be handled:
1889    // --- !ruby/object:Gem::Specification
1890    // We strip Ruby-specific YAML tags since yaml_serde can't handle them
1891    let cleaned = clean_ruby_yaml_tags(content);
1892
1893    let yaml: yaml_serde::Value =
1894        yaml_serde::from_str(&cleaned).map_err(|e| format!("Failed to parse YAML: {}", e))?;
1895
1896    let name = yaml_string(&yaml, "name").map(truncate_field);
1897    let version = yaml.get("version").and_then(|v| {
1898        if v.is_string() {
1899            v.as_str().map(|s| truncate_field(s.to_string()))
1900        } else {
1901            yaml_string(v, "version").map(truncate_field)
1902        }
1903    });
1904    let description = yaml_string(&yaml, "description")
1905        .or_else(|| yaml_string(&yaml, "summary"))
1906        .map(truncate_field);
1907    let homepage = yaml_string(&yaml, "homepage").map(truncate_field);
1908    let summary = yaml_string(&yaml, "summary").map(truncate_field);
1909
1910    // Licenses
1911    let licenses: Vec<String> = yaml
1912        .get("licenses")
1913        .and_then(|v| v.as_sequence())
1914        .map(|seq| {
1915            seq.iter()
1916                .filter_map(|item| item.as_str().map(|s| truncate_field(s.to_string())))
1917                .collect()
1918        })
1919        .unwrap_or_default();
1920
1921    // Extract license statement only - detection happens in separate engine
1922    let extracted_license_statement = if !licenses.is_empty() {
1923        Some(licenses.join(" AND "))
1924    } else {
1925        None
1926    };
1927
1928    let (license_expression, license_expression_spdx, license_detections) =
1929        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1930
1931    // Authors
1932    let authors: Vec<String> = yaml
1933        .get("authors")
1934        .and_then(|v| v.as_sequence())
1935        .map(|seq| {
1936            seq.iter()
1937                .filter_map(|item| item.as_str().map(|s| truncate_field(s.to_string())))
1938                .collect()
1939        })
1940        .unwrap_or_default();
1941
1942    let emails: Vec<String> = yaml
1943        .get("email")
1944        .map(|v| {
1945            if let Some(seq) = v.as_sequence() {
1946                seq.iter()
1947                    .filter_map(|item| item.as_str().map(|s| truncate_field(s.to_string())))
1948                    .collect()
1949            } else if let Some(s) = v.as_str() {
1950                vec![truncate_field(s.to_string())]
1951            } else {
1952                Vec::new()
1953            }
1954        })
1955        .unwrap_or_default();
1956
1957    // Build parties
1958    let mut parties: Vec<Party> = Vec::new();
1959    let max_len = authors.len().max(emails.len());
1960    for i in 0..max_len {
1961        let author_name = authors.get(i).map(|s| s.as_str());
1962        let email_str = emails.get(i).map(|s| s.as_str());
1963
1964        let (parsed_email_name, parsed_email) = match email_str {
1965            Some(e) if e.contains('<') => split_name_email(e),
1966            None => (None, None),
1967            _ => (None, None),
1968        };
1969
1970        let party_name = author_name.map(|s| s.to_string()).or(parsed_email_name);
1971
1972        parties.push(Party {
1973            r#type: Some("person".to_string()),
1974            role: Some("author".to_string()),
1975            name: party_name,
1976            email: parsed_email.or_else(|| {
1977                email_str
1978                    .filter(|e| e.contains('@') && !e.contains('<'))
1979                    .map(|e| e.to_string())
1980            }),
1981            url: None,
1982            organization: None,
1983            organization_url: None,
1984            timezone: None,
1985        });
1986    }
1987
1988    // Dependencies
1989    let dependencies = parse_gem_yaml_dependencies(&yaml);
1990
1991    let metadata = yaml.get("metadata");
1992
1993    let bug_tracking_url = metadata
1994        .and_then(|m| yaml_string(m, "bug_tracking_uri"))
1995        .map(truncate_field);
1996
1997    let code_view_url = metadata
1998        .and_then(|m| yaml_string(m, "source_code_uri"))
1999        .map(truncate_field);
2000
2001    let vcs_url = code_view_url.clone().or_else(|| {
2002        metadata
2003            .and_then(|m| yaml_string(m, "homepage_uri"))
2004            .map(truncate_field)
2005    });
2006
2007    let file_references = metadata
2008        .and_then(|m| m.get("files"))
2009        .and_then(|f| f.as_sequence())
2010        .map(|seq| {
2011            seq.iter()
2012                .filter_map(|v| v.as_str())
2013                .map(|s| crate::models::FileReference {
2014                    path: s.to_string(),
2015                    size: None,
2016                    sha1: None,
2017                    md5: None,
2018                    sha256: None,
2019                    sha512: None,
2020                    extra_data: None,
2021                })
2022                .collect::<Vec<_>>()
2023        })
2024        .unwrap_or_default();
2025
2026    let release_date = yaml_string(&yaml, "date").and_then(|d| {
2027        if d.len() >= 10 {
2028            Some(d[..10].to_string())
2029        } else {
2030            None
2031        }
2032    });
2033
2034    let purl = name
2035        .as_deref()
2036        .map(|n| create_gem_purl(n, version.as_deref()))
2037        .unwrap_or(None);
2038
2039    let platform = yaml_string(&yaml, "platform").map(truncate_field);
2040    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
2041        if let Some(n) = name.as_deref() {
2042            get_rubygems_urls(n, version.as_deref(), platform.as_deref())
2043        } else {
2044            (None, None, None, None)
2045        };
2046
2047    let qualifiers = if let Some(ref p) = platform {
2048        if p != "ruby" {
2049            let mut q = HashMap::new();
2050            q.insert("platform".to_string(), p.clone());
2051            Some(q)
2052        } else {
2053            None
2054        }
2055    } else {
2056        None
2057    };
2058
2059    Ok(PackageData {
2060        package_type: Some(PACKAGE_TYPE),
2061        name,
2062        version,
2063        qualifiers,
2064        primary_language: Some("Ruby".to_string()),
2065        description: description.or(summary),
2066        release_date,
2067        homepage_url: homepage,
2068        download_url,
2069        bug_tracking_url,
2070        code_view_url,
2071        declared_license_expression: license_expression,
2072        declared_license_expression_spdx: license_expression_spdx,
2073        license_detections,
2074        extracted_license_statement,
2075        file_references,
2076        parties,
2077        dependencies,
2078        repository_homepage_url,
2079        repository_download_url,
2080        api_data_url,
2081        datasource_id: Some(datasource_id),
2082        purl,
2083        vcs_url,
2084        ..default_package_data()
2085    })
2086}
2087
2088/// Strips Ruby-specific YAML tags that yaml_serde cannot handle.
2089fn clean_ruby_yaml_tags(content: &str) -> String {
2090    let tag_re = match Regex::new(r"!ruby/\S+") {
2091        Ok(r) => r,
2092        Err(_) => return content.to_string(),
2093    };
2094    tag_re.replace_all(content, "").to_string()
2095}
2096
2097fn yaml_string(yaml: &yaml_serde::Value, key: &str) -> Option<String> {
2098    yaml.get(key)
2099        .and_then(|v| v.as_str())
2100        .filter(|s| !s.is_empty())
2101        .map(|s| s.to_string())
2102}
2103
2104fn parse_gem_yaml_dependencies(yaml: &yaml_serde::Value) -> Vec<Dependency> {
2105    let mut dependencies = Vec::new();
2106
2107    let deps_seq = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
2108        Some(seq) => seq,
2109        None => return dependencies,
2110    };
2111
2112    for dep_value in deps_seq.iter().take(MAX_ITERATION_COUNT) {
2113        let dep_name = match yaml_string(dep_value, "name").map(truncate_field) {
2114            Some(n) => n,
2115            None => continue,
2116        };
2117
2118        let dep_type = yaml_string(dep_value, "type");
2119        let is_development = dep_type.as_deref() == Some(":development");
2120
2121        // Extract version requirements from the nested structure
2122        let requirements = dep_value
2123            .get("requirement")
2124            .or_else(|| dep_value.get("version_requirements"))
2125            .and_then(|req| req.get("requirements"))
2126            .and_then(|reqs| reqs.as_sequence());
2127
2128        let extracted_requirement = requirements.map(|reqs| {
2129            let parts: Vec<String> = reqs
2130                .iter()
2131                .filter_map(|req| {
2132                    let seq = req.as_sequence()?;
2133                    if seq.len() >= 2 {
2134                        let op = seq[0].as_str().unwrap_or("");
2135                        let ver = seq[1].get("version").and_then(|v| v.as_str()).unwrap_or("");
2136                        if op == ">=" && ver == "0" {
2137                            // ">= 0" means "any version" - skip
2138                            None
2139                        } else if op.is_empty() || ver.is_empty() {
2140                            None
2141                        } else {
2142                            Some(format!("{} {}", op, ver))
2143                        }
2144                    } else {
2145                        None
2146                    }
2147                })
2148                .collect();
2149            parts.join(", ")
2150        });
2151
2152        let extracted_requirement = extracted_requirement
2153            .filter(|s| !s.is_empty())
2154            .or_else(|| Some(String::new()));
2155
2156        let (scope, is_runtime, is_optional) = if is_development {
2157            (Some("development".to_string()), false, true)
2158        } else {
2159            (Some("runtime".to_string()), true, false)
2160        };
2161
2162        let purl = create_gem_purl(&dep_name, None);
2163
2164        dependencies.push(Dependency {
2165            purl,
2166            extracted_requirement,
2167            scope,
2168            is_runtime: Some(is_runtime),
2169            is_optional: Some(is_optional),
2170            is_pinned: None,
2171            is_direct: Some(true),
2172            resolved_package: None,
2173            extra_data: None,
2174        });
2175    }
2176
2177    dependencies
2178}
2179
2180// =============================================================================
2181// Gem Metadata Extracted Parser (metadata.gz-extract files)
2182// =============================================================================
2183
2184pub struct GemMetadataExtractedParser;
2185
2186impl PackageParser for GemMetadataExtractedParser {
2187    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
2188
2189    fn metadata() -> Vec<ParserMetadata> {
2190        vec![ParserMetadata {
2191            description: "Ruby gem metadata (extracted)",
2192            file_patterns: &["**/metadata.gz-extract"],
2193            package_type: "gem",
2194            primary_language: "Ruby",
2195            documentation_url: Some("https://guides.rubygems.org/specification-reference/"),
2196        }]
2197    }
2198
2199    fn extract_packages(path: &Path) -> Vec<PackageData> {
2200        vec![match extract_gem_metadata_extracted(path) {
2201            Ok(data) => data,
2202            Err(e) => {
2203                warn!("Failed to extract gem metadata from {:?}: {}", path, e);
2204                default_package_data_with_datasource(DatasourceId::GemArchiveExtracted)
2205            }
2206        }]
2207    }
2208
2209    fn is_match(path: &Path) -> bool {
2210        path.to_str()
2211            .is_some_and(|p| p.contains("metadata.gz-extract"))
2212    }
2213}
2214
2215fn extract_gem_metadata_extracted(path: &Path) -> Result<PackageData, String> {
2216    let content = read_file_to_string(path, None)
2217        .map_err(|e| format!("Failed to read metadata.gz-extract file: {}", e))?;
2218
2219    parse_gem_metadata_yaml(&content, DatasourceId::GemArchiveExtracted)
2220}
2221
#[cfg(test)]
mod tests {
    use super::{clean_gemspec_value, parse_gemspec};

    /// A `%q{` literal without its closing brace still yields the inner text.
    #[test]
    fn test_clean_gemspec_value_handles_unterminated_percent_q() {
        let cleaned = clean_gemspec_value("%q{Arel is a SQL AST manager for Ruby. It");
        assert_eq!(cleaned, "Arel is a SQL AST manager for Ruby. It");
    }

    /// Both `add_runtime_dependency` and `add_dependency` map to the
    /// `runtime` scope and keep their version requirement.
    #[test]
    fn test_parse_gemspec_runtime_dependency_scope() {
        let content = r#"
Gem::Specification.new do |spec|
  spec.name = "demo"
  spec.version = "1.0.0"
  spec.add_runtime_dependency "rack", "~> 3.0"
  spec.add_dependency "thor", ">= 1.0"
end
"#;

        // Expected requirement of each dependency, in declaration order.
        let expected_requirements = ["~> 3.0", ">= 1.0"];

        let package_data = parse_gemspec(content);
        assert_eq!(package_data.dependencies.len(), expected_requirements.len());

        for (dependency, requirement) in package_data
            .dependencies
            .iter()
            .zip(expected_requirements.iter())
        {
            assert_eq!(dependency.scope.as_deref(), Some("runtime"));
            assert_eq!(
                dependency.extracted_requirement.as_deref(),
                Some(*requirement)
            );
        }
    }
}