Skip to main content

provenant/parsers/
ruby.rs

1//! Parser for Ruby/RubyGems package manifests.
2//!
3//! Extracts package metadata, dependencies, and platform information from
4//! Gemfile and Gemfile.lock files used by Ruby/Bundler projects.
5//!
6//! # Supported Formats
7//! - Gemfile (manifest with Ruby DSL)
8//! - Gemfile.lock (lockfile with state machine sections)
9//! - *.gemspec (gem specification files)
10//! - *.gem (gem archive packages)
11//! - metadata.gz-extract (pre-extracted gem metadata)
12//!
13//! # Key Features
14//! - State machine parsing for Gemfile.lock sections (GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH, DEPENDENCIES)
15//! - Regex-based Ruby DSL parsing for Gemfile
16//! - Dependency group handling (:development, :test, etc.)
17//! - Platform-specific gem support
18//! - Pessimistic version operator (~>) support
19//! - Bug Fix #1: Strip .freeze suffix from strings
20//! - Bug Fix #4: Correct dependency scope mapping (:runtime → None, :development → "development")
21//!
22//! # Implementation Notes
23//! - Uses regex for pattern matching (not full Ruby AST)
24//! - Graceful error handling: logs warnings and returns default on parse failure
25//! - PURL type: "gem"
26
27use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
28use crate::parser_warn as warn;
29use crate::parsers::utils::split_name_email;
30use flate2::read::GzDecoder;
31use packageurl::PackageUrl;
32use regex::Regex;
33use std::collections::HashMap;
34use std::fs::{self, File};
35use std::io::Read;
36use std::path::{Path, PathBuf};
37use tar::Archive;
38
39use super::PackageParser;
40use super::license_normalization::normalize_spdx_declared_license;
41
/// Package type shared by the Gemfile and Gemfile.lock parsers in this file;
/// its `as_str()` form is also used as the PURL type when gem PURLs are built.
const PACKAGE_TYPE: PackageType = PackageType::Gem;
43
44// =============================================================================
45// Bug Fix #1: Strip .freeze suffix from strings
46// =============================================================================
47
/// Removes trailing `.freeze` calls from a Ruby string expression.
///
/// Ruby code often writes `"literal".freeze` to make a string immutable; the
/// call carries no information for manifest parsing. Every trailing
/// occurrence is removed, so `"x".freeze.freeze` also reduces to `"x"`.
/// Input that does not end in `.freeze` is returned unchanged.
///
/// # Examples
/// ```ignore
/// assert_eq!(strip_freeze_suffix("\"name\".freeze"), "\"name\"");
/// assert_eq!(strip_freeze_suffix("'1.0.0'.freeze"), "'1.0.0'");
/// ```
pub fn strip_freeze_suffix(s: &str) -> &str {
    let mut remaining = s;
    // Peel one `.freeze` at a time until none is left at the end.
    while let Some(shorter) = remaining.strip_suffix(".freeze") {
        remaining = shorter;
    }
    remaining
}
61
/// A `do ... end` block that is open while a Gemfile is being scanned.
enum GemfileBlock {
    /// `group :a, :b do` — holds the group names taken from the block header.
    Group(Vec<String>),
    /// `source "url" do` — holds the source URL taken from the block header.
    Source(String),
}
66
67// =============================================================================
68// Gemfile Parser (Ruby DSL)
69// =============================================================================
70
71/// Ruby Gemfile parser for manifest files.
72///
73/// Parses Ruby DSL syntax to extract gem declarations, dependency groups,
74/// platform-specific gems, and version constraints.
75pub struct GemfileParser;
76
77impl PackageParser for GemfileParser {
78    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
79
80    fn extract_packages(path: &Path) -> Vec<PackageData> {
81        let content = match fs::read_to_string(path) {
82            Ok(c) => c,
83            Err(e) => {
84                warn!("Failed to read Gemfile at {:?}: {}", path, e);
85                return vec![default_package_data_with_datasource(DatasourceId::Gemfile)];
86            }
87        };
88
89        vec![parse_gemfile(&content)]
90    }
91
92    fn is_match(path: &Path) -> bool {
93        path.file_name()
94            .and_then(|n| n.to_str())
95            .is_some_and(|name| name == "Gemfile")
96            || path
97                .to_str()
98                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile"))
99    }
100}
101
102/// Parses Gemfile content and extracts dependencies with groups.
103fn parse_gemfile(content: &str) -> PackageData {
104    let mut dependencies = Vec::new();
105    let mut block_stack = Vec::new();
106    let mut default_source = None;
107    let mut sources = Vec::new();
108
109    // Regex patterns for Gemfile parsing
110    // gem "name", "version", options...
111    let gem_regex = match Regex::new(
112        r#"^\s*gem\s+["']([^"']+)["'](?:\.freeze)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*(.+))?"#,
113    ) {
114        Ok(r) => r,
115        Err(e) => {
116            warn!("Failed to compile gem regex: {}", e);
117            return default_package_data_with_datasource(DatasourceId::Gemfile);
118        }
119    };
120
121    // group :name do ... end
122    let group_start_regex = match Regex::new(r"^\s*group\s+(.+?)\s+do\s*$") {
123        Ok(r) => r,
124        Err(e) => {
125            warn!("Failed to compile group regex: {}", e);
126            return default_package_data_with_datasource(DatasourceId::Gemfile);
127        }
128    };
129
130    let group_end_regex = match Regex::new(r"^\s*end\s*$") {
131        Ok(r) => r,
132        Err(e) => {
133            warn!("Failed to compile end regex: {}", e);
134            return default_package_data_with_datasource(DatasourceId::Gemfile);
135        }
136    };
137
138    let source_block_start_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s+do\s*$"#) {
139        Ok(r) => r,
140        Err(e) => {
141            warn!("Failed to compile source block regex: {}", e);
142            return default_package_data_with_datasource(DatasourceId::Gemfile);
143        }
144    };
145
146    let source_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s*$"#) {
147        Ok(r) => r,
148        Err(e) => {
149            warn!("Failed to compile source regex: {}", e);
150            return default_package_data_with_datasource(DatasourceId::Gemfile);
151        }
152    };
153
154    // Parse symbols like :development, :test
155    let symbol_regex = match Regex::new(r":(\w+)") {
156        Ok(r) => r,
157        Err(e) => {
158            warn!("Failed to compile symbol regex: {}", e);
159            return default_package_data_with_datasource(DatasourceId::Gemfile);
160        }
161    };
162
163    for line in content.lines() {
164        let trimmed = line.trim();
165
166        // Skip comments and empty lines
167        if trimmed.is_empty() || trimmed.starts_with('#') {
168            continue;
169        }
170
171        // Check for group start
172        if let Some(caps) = group_start_regex.captures(trimmed) {
173            let groups_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
174            let mut current_groups = Vec::new();
175            for cap in symbol_regex.captures_iter(groups_str) {
176                if let Some(group_name) = cap.get(1) {
177                    current_groups.push(group_name.as_str().to_string());
178                }
179            }
180            block_stack.push(GemfileBlock::Group(current_groups));
181            continue;
182        }
183
184        if let Some(caps) = source_block_start_regex.captures(trimmed) {
185            let source = caps
186                .get(1)
187                .map(|m| m.as_str().to_string())
188                .unwrap_or_default();
189            if !source.is_empty() {
190                push_unique_string(&mut sources, source.clone());
191                block_stack.push(GemfileBlock::Source(source));
192            }
193            continue;
194        }
195
196        if let Some(caps) = source_regex.captures(trimmed) {
197            if let Some(source) = caps.get(1).map(|m| m.as_str().to_string()) {
198                push_unique_string(&mut sources, source.clone());
199                default_source = Some(source);
200            }
201            continue;
202        }
203
204        // Check for group end
205        if group_end_regex.is_match(trimmed) {
206            block_stack.pop();
207            continue;
208        }
209
210        // Parse gem declaration
211        if let Some(caps) = gem_regex.captures(trimmed) {
212            let name = strip_freeze_suffix(caps.get(1).map(|m| m.as_str()).unwrap_or(""));
213            if name.is_empty() {
214                continue;
215            }
216
217            // Collect version constraints
218            let mut version_parts = Vec::new();
219            if let Some(v) = caps.get(2) {
220                version_parts.push(strip_freeze_suffix(v.as_str()).to_string());
221            }
222            if let Some(v) = caps.get(3) {
223                let v_str = strip_freeze_suffix(v.as_str());
224                // Check if it looks like a version constraint
225                if looks_like_version_constraint(v_str) {
226                    version_parts.push(v_str.to_string());
227                }
228            }
229
230            let extracted_requirement = if version_parts.is_empty() {
231                None
232            } else {
233                Some(version_parts.join(", "))
234            };
235
236            let current_groups = current_group_names(&block_stack);
237
238            // Determine scope based on current group
239            // Bug Fix #4: :runtime → None, :development → "development"
240            let (scope, is_runtime, is_optional) = if current_groups.is_empty() {
241                // No group = runtime dependency
242                (None, true, false)
243            } else if current_groups.iter().any(|g| g == "development") {
244                (Some("development".to_string()), false, true)
245            } else if current_groups.iter().any(|g| g == "test") {
246                (Some("test".to_string()), false, true)
247            } else {
248                // Other groups (e.g., :production)
249                let group = current_groups.first().cloned();
250                (group, true, false)
251            };
252
253            // Create PURL
254            let purl = create_gem_purl(name, None);
255            let inherited_source = current_source(&block_stack, default_source.as_deref());
256            let extra_data = build_gemfile_dependency_extra_data(
257                caps.get(4).map(|m| m.as_str()),
258                inherited_source.as_deref(),
259            );
260
261            dependencies.push(Dependency {
262                purl,
263                extracted_requirement,
264                scope,
265                is_runtime: Some(is_runtime),
266                is_optional: Some(is_optional),
267                is_pinned: None,
268                is_direct: Some(true),
269                resolved_package: None,
270                extra_data,
271            });
272        }
273    }
274
275    let extra_data = if sources.is_empty() {
276        None
277    } else {
278        Some(HashMap::from([(
279            "sources".to_string(),
280            serde_json::Value::Array(sources.into_iter().map(serde_json::Value::String).collect()),
281        )]))
282    };
283
284    PackageData {
285        package_type: Some(PACKAGE_TYPE),
286        primary_language: Some("Ruby".to_string()),
287        dependencies,
288        extra_data,
289        datasource_id: Some(DatasourceId::Gemfile),
290        ..default_package_data()
291    }
292}
293
294fn current_group_names(block_stack: &[GemfileBlock]) -> Vec<String> {
295    block_stack
296        .iter()
297        .rev()
298        .find_map(|block| match block {
299            GemfileBlock::Group(groups) => Some(groups.clone()),
300            GemfileBlock::Source(_) => None,
301        })
302        .unwrap_or_default()
303}
304
305fn current_source(block_stack: &[GemfileBlock], default_source: Option<&str>) -> Option<String> {
306    block_stack
307        .iter()
308        .rev()
309        .find_map(|block| match block {
310            GemfileBlock::Source(source) => Some(source.clone()),
311            GemfileBlock::Group(_) => None,
312        })
313        .or_else(|| default_source.map(str::to_string))
314}
315
/// Appends `value` to `values` unless an equal string is already present,
/// keeping the existing order.
fn push_unique_string(values: &mut Vec<String>, value: String) {
    let already_present = values.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    values.push(value);
}
321
322fn build_gemfile_dependency_extra_data(
323    options: Option<&str>,
324    inherited_source: Option<&str>,
325) -> Option<HashMap<String, serde_json::Value>> {
326    let mut extra = HashMap::new();
327    let options = options.unwrap_or("");
328
329    if let Some(git) = extract_gemfile_quoted_option(options, "git") {
330        extra.insert(
331            "source_type".to_string(),
332            serde_json::Value::String("GIT".to_string()),
333        );
334        extra.insert("git".to_string(), serde_json::Value::String(git.clone()));
335        extra.insert("remote".to_string(), serde_json::Value::String(git));
336    }
337
338    if let Some(path) = extract_gemfile_quoted_option(options, "path") {
339        extra.insert(
340            "source_type".to_string(),
341            serde_json::Value::String("PATH".to_string()),
342        );
343        extra.insert("path".to_string(), serde_json::Value::String(path));
344    }
345
346    for key in ["branch", "ref", "tag"] {
347        if let Some(value) = extract_gemfile_quoted_option(options, key) {
348            extra.insert(key.to_string(), serde_json::Value::String(value));
349        }
350    }
351
352    let direct_source = extract_gemfile_quoted_option(options, "source");
353    if let Some(source) = direct_source {
354        extra.insert("source".to_string(), serde_json::Value::String(source));
355    } else if !extra.contains_key("source_type")
356        && let Some(source) = inherited_source
357    {
358        extra.insert(
359            "source".to_string(),
360            serde_json::Value::String(source.to_string()),
361        );
362    }
363
364    (!extra.is_empty()).then_some(extra)
365}
366
/// Looks up the quoted value of option `key` in the trailing option text of a
/// `gem` declaration.
///
/// Both Ruby hash spellings are accepted:
/// - keyword style: `git: "https://example.com/x.git"`
/// - hash-rocket style: `:git => "https://example.com/x.git"`
///
/// An option is only recognised at the very start of `options` or directly
/// after a comma (whitespace after the comma is skipped). The value must be a
/// non-empty single- or double-quoted string; the first matching option wins.
///
/// Returns `None` when `options` is empty or holds no such option.
fn extract_gemfile_quoted_option(options: &str, key: &str) -> Option<String> {
    if options.is_empty() {
        return None;
    }

    // Candidate positions: the start of the text and whatever follows each comma.
    let after_commas = options
        .match_indices(',')
        .map(|(comma, _)| options[comma + 1..].trim_start());

    std::iter::once(options)
        .chain(after_commas)
        .find_map(|candidate| gemfile_quoted_value_at(candidate, key))
}

/// Parses `key: "value"` or `:key => "value"` at the very start of
/// `candidate` and returns the unquoted value.
fn gemfile_quoted_value_at(candidate: &str, key: &str) -> Option<String> {
    let is_quote = |c: char| c == '"' || c == '\'';

    // Keyword style: `key` [ws] `:`
    let keyword_style = candidate
        .strip_prefix(key)
        .and_then(|rest| rest.trim_start().strip_prefix(':'));
    // Hash-rocket style: `:key` [ws] `=>`
    let after_separator = keyword_style.or_else(|| {
        candidate
            .strip_prefix(':')
            .and_then(|rest| rest.strip_prefix(key))
            .and_then(|rest| rest.trim_start().strip_prefix("=>"))
    })?;

    // Opening quote, then everything up to the next quote of either kind.
    let quoted = after_separator.trim_start().strip_prefix(is_quote)?;
    let value_len = quoted.find(is_quote)?;
    (value_len > 0).then(|| quoted[..value_len].to_string())
}
378
/// Tells whether `s` starts like a version requirement: with one of the
/// operator characters `~`, `>`, `<`, `=`, `!`, or with an ASCII digit.
/// An empty string is not a constraint.
fn looks_like_version_constraint(s: &str) -> bool {
    match s.chars().next() {
        Some('~' | '>' | '<' | '=' | '!') => true,
        Some(first) => first.is_ascii_digit(),
        None => false,
    }
}
388
389// =============================================================================
390// Gemfile.lock Parser (State Machine)
391// =============================================================================
392
393/// Ruby Gemfile.lock parser for lockfiles.
394///
395/// Uses a state machine to parse sections: GEM, GIT, PATH, SVN,
396/// PLATFORMS, BUNDLED WITH, DEPENDENCIES.
397pub struct GemfileLockParser;
398
399impl PackageParser for GemfileLockParser {
400    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
401
402    fn extract_packages(path: &Path) -> Vec<PackageData> {
403        let content = match fs::read_to_string(path) {
404            Ok(c) => c,
405            Err(e) => {
406                warn!("Failed to read Gemfile.lock at {:?}: {}", path, e);
407                return vec![default_package_data_with_datasource(
408                    DatasourceId::GemfileLock,
409                )];
410            }
411        };
412
413        vec![parse_gemfile_lock(&content)]
414    }
415
416    fn is_match(path: &Path) -> bool {
417        path.file_name()
418            .and_then(|n| n.to_str())
419            .is_some_and(|name| name == "Gemfile.lock")
420            || path
421                .to_str()
422                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile.lock"))
423    }
424}
425
/// Parse state for Gemfile.lock state machine.
///
/// The state names the section the parser is currently inside; it changes
/// when a section header or a `specs:` line is read.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// Initial state, before any section header; lines are ignored.
    None,
    /// Inside `GEM`, before `specs:`; `key: value` option lines are read.
    Gem,
    /// Inside `GIT`, before `specs:`; `key: value` option lines are read.
    Git,
    /// Inside `PATH`, before `specs:`; `key: value` option lines are read.
    Path,
    /// Inside `SVN`, before `specs:`; `key: value` option lines are read.
    Svn,
    /// Inside the `specs:` list of a GEM/GIT/PATH/SVN section.
    Specs,
    /// Inside `PLATFORMS`; each line is recorded as a platform.
    Platforms,
    /// Inside `BUNDLED WITH`; the Bundler version is read.
    BundledWith,
    /// Inside `DEPENDENCIES`; direct dependency entries are read.
    Dependencies,
}
439
/// Parsed gem information from Gemfile.lock.
///
/// All fields are actively used:
/// - `gem_type`, `remote`, `revision`, `ref_field`, `branch`, `tag`: Stored in extra_data for GIT/PATH/SVN sources
/// - `name`, `version`, `platform`, `pinned`: Used for dependency PURL and metadata generation
/// - `requirements`: Stored as extracted_requirement for version constraints
#[derive(Debug, Clone, Default)]
struct GemInfo {
    // Gem name as written in the lockfile.
    name: String,
    // Version from the `specs:` entry; `None` for gems seen only under DEPENDENCIES.
    version: Option<String>,
    // Part of the spec version after the first `-`, if any.
    platform: Option<String>,
    // Header of the section the gem was listed in: "GEM", "GIT", "PATH" or "SVN".
    gem_type: String,
    // `remote:` option of the enclosing section.
    remote: Option<String>,
    // `revision:` option of the enclosing section.
    revision: Option<String>,
    // `ref:` option of the enclosing section.
    ref_field: Option<String>,
    // `branch:` option of the enclosing section.
    branch: Option<String>,
    // `tag:` option of the enclosing section.
    tag: Option<String>,
    // True when the DEPENDENCIES entry ends with `!`.
    pinned: bool,
    // Version constraints collected from DEPENDENCIES entries.
    requirements: Vec<String>,
}
460
461/// Parses Gemfile.lock content using a state machine.
462fn parse_gemfile_lock(content: &str) -> PackageData {
463    let mut state = ParseState::None;
464    let mut dependencies = Vec::new();
465    let mut gems: HashMap<String, GemInfo> = HashMap::new();
466    let mut platforms: Vec<String> = Vec::new();
467    let mut bundler_version: Option<String> = None;
468    let mut current_gem_type = String::new();
469    let mut current_remote: Option<String> = None;
470    let mut current_options: HashMap<String, String> = HashMap::new();
471
472    // DEPS pattern: 2 spaces at line start
473    let deps_regex = match Regex::new(r"^ {2}([^ \)\(,!:]+)(?: \(([^)]+)\))?(!)?$") {
474        Ok(r) => r,
475        Err(e) => {
476            warn!("Failed to compile deps regex: {}", e);
477            return default_package_data_with_datasource(DatasourceId::GemfileLock);
478        }
479    };
480
481    // SPEC_DEPS pattern: 4 spaces at line start
482    let spec_deps_regex = match Regex::new(r"^ {4}([^ \)\(,!:]+)(?: \(([^)]+)\))?$") {
483        Ok(r) => r,
484        Err(e) => {
485            warn!("Failed to compile spec_deps regex: {}", e);
486            return default_package_data_with_datasource(DatasourceId::GemfileLock);
487        }
488    };
489
490    // OPTIONS pattern: key: value
491    let options_regex = match Regex::new(r"^ {2}([a-z]+): (.+)$") {
492        Ok(r) => r,
493        Err(e) => {
494            warn!("Failed to compile options regex: {}", e);
495            return default_package_data_with_datasource(DatasourceId::GemfileLock);
496        }
497    };
498
499    // VERSION pattern for BUNDLED WITH
500    let version_regex = match Regex::new(r"^\s+(\d+(?:\.\d+)+)\s*$") {
501        Ok(r) => r,
502        Err(e) => {
503            warn!("Failed to compile version regex: {}", e);
504            return default_package_data_with_datasource(DatasourceId::GemfileLock);
505        }
506    };
507
508    for line in content.lines() {
509        let trimmed = line.trim_end();
510
511        // Empty line resets state
512        if trimmed.is_empty() {
513            current_options.clear();
514            continue;
515        }
516
517        // Section headers (no leading whitespace) and sub-section headers
518        match trimmed {
519            "GEM" => {
520                state = ParseState::Gem;
521                current_gem_type = "GEM".to_string();
522                current_remote = None;
523                current_options.clear();
524                continue;
525            }
526            "GIT" => {
527                state = ParseState::Git;
528                current_gem_type = "GIT".to_string();
529                current_remote = None;
530                current_options.clear();
531                continue;
532            }
533            "PATH" => {
534                state = ParseState::Path;
535                current_gem_type = "PATH".to_string();
536                current_remote = None;
537                current_options.clear();
538                continue;
539            }
540            "SVN" => {
541                state = ParseState::Svn;
542                current_gem_type = "SVN".to_string();
543                current_remote = None;
544                current_options.clear();
545                continue;
546            }
547            "PLATFORMS" => {
548                state = ParseState::Platforms;
549                continue;
550            }
551            "BUNDLED WITH" => {
552                state = ParseState::BundledWith;
553                continue;
554            }
555            "DEPENDENCIES" => {
556                state = ParseState::Dependencies;
557                continue;
558            }
559            _ => {}
560        }
561
562        // Check for "  specs:" sub-section header (2-space indent) within
563        // GEM/GIT/PATH/SVN sections. This must be checked separately because
564        // the leading whitespace is preserved by trim_end().
565        if trimmed.trim() == "specs:" {
566            state = match state {
567                ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
568                    ParseState::Specs
569                }
570                _ => state,
571            };
572            continue;
573        }
574
575        // Process based on current state
576        match state {
577            ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
578                // Parse options (remote:, revision:, ref:, branch:, tag:)
579                if let Some(caps) = options_regex.captures(line) {
580                    let key = caps.get(1).map(|m| m.as_str()).unwrap_or("");
581                    let value = caps.get(2).map(|m| m.as_str()).unwrap_or("");
582                    current_options.insert(key.to_string(), value.to_string());
583                    if key == "remote" {
584                        current_remote = Some(value.to_string());
585                    }
586                }
587            }
588            ParseState::Specs => {
589                // Parse gem specs (4 spaces indent)
590                if let Some(caps) = spec_deps_regex.captures(line) {
591                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
592                    let version_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
593
594                    // Parse version and platform
595                    let (version, platform) = parse_version_platform(version_str);
596
597                    if !name.is_empty() {
598                        let gem_info = GemInfo {
599                            name: name.clone(),
600                            version,
601                            platform,
602                            gem_type: current_gem_type.clone(),
603                            remote: current_remote.clone(),
604                            revision: current_options.get("revision").cloned(),
605                            ref_field: current_options.get("ref").cloned(),
606                            branch: current_options.get("branch").cloned(),
607                            tag: current_options.get("tag").cloned(),
608                            pinned: false,
609                            requirements: Vec::new(),
610                        };
611                        gems.insert(name, gem_info);
612                    }
613                }
614            }
615            ParseState::Platforms => {
616                // Parse platform entries (2 spaces indent)
617                let platform = trimmed.trim();
618                if !platform.is_empty() {
619                    platforms.push(platform.to_string());
620                }
621            }
622            ParseState::BundledWith => {
623                // Parse bundler version
624                if let Some(caps) = version_regex.captures(line) {
625                    bundler_version = caps.get(1).map(|m| m.as_str().to_string());
626                }
627            }
628            ParseState::Dependencies => {
629                // Parse direct dependencies (2 spaces indent)
630                if let Some(caps) = deps_regex.captures(line) {
631                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
632                    let version_constraint = caps.get(2).map(|m| m.as_str().to_string());
633                    let pinned = caps.get(3).is_some();
634
635                    if !name.is_empty() {
636                        // Update gem info if exists, or create new
637                        if let Some(gem) = gems.get_mut(&name) {
638                            gem.pinned = pinned;
639                            if let Some(vc) = &version_constraint {
640                                gem.requirements.push(vc.clone());
641                            }
642                        } else {
643                            let gem_info = GemInfo {
644                                name: name.clone(),
645                                version: None,
646                                platform: None,
647                                gem_type: "GEM".to_string(),
648                                remote: None,
649                                revision: None,
650                                ref_field: None,
651                                branch: None,
652                                tag: None,
653                                pinned,
654                                requirements: version_constraint.into_iter().collect(),
655                            };
656                            gems.insert(name, gem_info);
657                        }
658                    }
659                }
660            }
661            ParseState::None => {}
662        }
663    }
664
665    let primary_gem = gems.values().find(|gem| gem.gem_type == "PATH").cloned();
666
667    let (
668        package_name,
669        package_version,
670        repository_homepage_url,
671        repository_download_url,
672        api_data_url,
673        download_url,
674    ) = if let Some(ref pg) = primary_gem {
675        let urls = get_rubygems_urls(&pg.name, pg.version.as_deref(), pg.platform.as_deref());
676        (
677            Some(pg.name.clone()),
678            pg.version.clone(),
679            urls.0,
680            urls.1,
681            urls.2,
682            urls.3,
683        )
684    } else {
685        (None, None, None, None, None, None)
686    };
687
688    for (_, gem) in gems {
689        if let Some(ref pg) = primary_gem
690            && gem.name == pg.name
691        {
692            continue;
693        }
694
695        let version_for_purl = gem.version.as_deref();
696        let purl = create_gem_purl(&gem.name, version_for_purl);
697
698        let extracted_requirement = if !gem.requirements.is_empty() {
699            Some(gem.requirements.join(", "))
700        } else {
701            gem.version.clone()
702        };
703
704        let extra_data = build_gem_source_extra_data(&gem);
705
706        dependencies.push(Dependency {
707            purl,
708            extracted_requirement,
709            scope: Some("dependencies".to_string()),
710            is_runtime: Some(true),
711            is_optional: Some(false),
712            is_pinned: Some(gem.pinned),
713            is_direct: Some(true),
714            resolved_package: None,
715            extra_data,
716        });
717    }
718
719    dependencies.sort_by(|left, right| {
720        left.purl
721            .as_deref()
722            .cmp(&right.purl.as_deref())
723            .then_with(|| {
724                left.extracted_requirement
725                    .as_deref()
726                    .cmp(&right.extracted_requirement.as_deref())
727            })
728    });
729
730    // Build extra_data
731    let mut extra_data = HashMap::new();
732    if !platforms.is_empty() {
733        extra_data.insert(
734            "platforms".to_string(),
735            serde_json::Value::Array(
736                platforms
737                    .into_iter()
738                    .map(serde_json::Value::String)
739                    .collect(),
740            ),
741        );
742    }
743    if let Some(bv) = bundler_version {
744        extra_data.insert("bundler_version".to_string(), serde_json::Value::String(bv));
745    }
746
747    let purl = package_name
748        .as_deref()
749        .map(|n| create_gem_purl(n, package_version.as_deref()))
750        .unwrap_or(None);
751
752    PackageData {
753        package_type: Some(PACKAGE_TYPE),
754        name: package_name,
755        version: package_version,
756        primary_language: Some("Ruby".to_string()),
757        download_url,
758        dependencies,
759        repository_homepage_url,
760        repository_download_url,
761        api_data_url,
762        extra_data: if extra_data.is_empty() {
763            None
764        } else {
765            Some(extra_data)
766        },
767        datasource_id: Some(DatasourceId::GemfileLock),
768        purl,
769        ..default_package_data()
770    }
771}
772
773fn build_gem_source_extra_data(gem: &GemInfo) -> Option<HashMap<String, serde_json::Value>> {
774    if gem.gem_type != "GIT" && gem.gem_type != "PATH" && gem.gem_type != "SVN" {
775        return None;
776    }
777
778    let mut extra = HashMap::new();
779    extra.insert(
780        "source_type".to_string(),
781        serde_json::Value::String(gem.gem_type.clone()),
782    );
783
784    if let Some(ref remote) = gem.remote {
785        extra.insert(
786            "remote".to_string(),
787            serde_json::Value::String(remote.clone()),
788        );
789    }
790    if let Some(ref revision) = gem.revision {
791        extra.insert(
792            "revision".to_string(),
793            serde_json::Value::String(revision.clone()),
794        );
795    }
796    if let Some(ref ref_field) = gem.ref_field {
797        extra.insert(
798            "ref".to_string(),
799            serde_json::Value::String(ref_field.clone()),
800        );
801    }
802    if let Some(ref branch) = gem.branch {
803        extra.insert(
804            "branch".to_string(),
805            serde_json::Value::String(branch.clone()),
806        );
807    }
808    if let Some(ref tag) = gem.tag {
809        extra.insert("tag".to_string(), serde_json::Value::String(tag.clone()));
810    }
811
812    Some(extra)
813}
814
/// Splits a lockfile spec version into version and platform at the first `-`.
///
/// Examples: `"2.6.3"` -> (`"2.6.3"`, no platform), `"2.6.3-java"` ->
/// (`"2.6.3"`, `"java"`). An empty string yields neither.
fn parse_version_platform(s: &str) -> (Option<String>, Option<String>) {
    if s.is_empty() {
        return (None, None);
    }

    match s.split_once('-') {
        Some((version, platform)) => (Some(version.to_string()), Some(platform.to_string())),
        None => (Some(s.to_string()), None),
    }
}
829
830/// Creates a gem PURL.
831fn create_gem_purl(name: &str, version: Option<&str>) -> Option<String> {
832    let mut purl = match PackageUrl::new(PACKAGE_TYPE.as_str(), name) {
833        Ok(p) => p,
834        Err(e) => {
835            warn!("Failed to create PURL for gem '{}': {}", name, e);
836            return None;
837        }
838    };
839
840    if let Some(v) = version
841        && let Err(e) = purl.with_version(v)
842    {
843        warn!("Failed to set version '{}' for gem '{}': {}", v, name, e);
844    }
845
846    Some(purl.to_string())
847}
848
/// Returns the rubygems.org page URL for a gem, or for one of its versions
/// when `version` is given (whitespace and surrounding `/` are trimmed from
/// the version). An empty `name` yields `None`.
fn rubygems_homepage_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(raw) => {
            let cleaned = raw.trim().trim_matches('/');
            format!("https://rubygems.org/gems/{}/versions/{}", name, cleaned)
        }
        None => format!("https://rubygems.org/gems/{}", name),
    };
    Some(url)
}
861
/// Returns the rubygems.org download URL of the `.gem` archive.
///
/// Requires a non-empty `name` and a `version`; otherwise `None`. Name and
/// version are trimmed of whitespace and surrounding `/`. A platform other
/// than `"ruby"` is appended to the version as `-<platform>`.
fn rubygems_download_url(
    name: &str,
    version: Option<&str>,
    platform: Option<&str>,
) -> Option<String> {
    if name.is_empty() {
        return None;
    }
    let version = version?.trim().trim_matches('/');
    let name = name.trim().trim_matches('/');

    let file_version = match platform {
        Some(p) if p != "ruby" => format!("{}-{}", version, p),
        _ => version.to_string(),
    };

    Some(format!(
        "https://rubygems.org/downloads/{}-{}.gem",
        name, file_version
    ))
}
889
/// Builds the rubygems.org JSON API URL for a gem.
///
/// Returns `None` when `name` is empty. With a version the v2 per-version endpoint is
/// used; without one the v1 versions-list endpoint is used.
fn rubygems_api_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    Some(match version {
        Some(v) => format!(
            "https://rubygems.org/api/v2/rubygems/{}/versions/{}.json",
            name, v
        ),
        None => format!("https://rubygems.org/api/v1/versions/{}.json", name),
    })
}
907
908fn get_rubygems_urls(
909    name: &str,
910    version: Option<&str>,
911    platform: Option<&str>,
912) -> (
913    Option<String>,
914    Option<String>,
915    Option<String>,
916    Option<String>,
917) {
918    let repository_homepage_url = rubygems_homepage_url(name, version);
919    let repository_download_url = rubygems_download_url(name, version, platform);
920    let api_data_url = rubygems_api_url(name, version);
921    let download_url = repository_download_url.clone();
922
923    (
924        repository_homepage_url,
925        repository_download_url,
926        api_data_url,
927        download_url,
928    )
929}
930
931/// Returns a default PackageData with gem-specific settings.
932fn default_package_data() -> PackageData {
933    PackageData {
934        package_type: Some(PACKAGE_TYPE),
935        primary_language: Some("Ruby".to_string()),
936        ..Default::default()
937    }
938}
939
940fn default_package_data_with_datasource(datasource_id: DatasourceId) -> PackageData {
941    PackageData {
942        datasource_id: Some(datasource_id),
943        ..default_package_data()
944    }
945}
946
947// =============================================================================
948// Gemspec Parser (Ruby DSL)
949// =============================================================================
950
951/// Ruby .gemspec file parser.
952///
953/// Parses `Gem::Specification.new` blocks using regex-based extraction.
954/// Handles frozen strings (Bug #1), variable version resolution (Bug #2),
955/// and RFC 5322 email parsing (Bug #6).
956pub struct GemspecParser;
957
958impl PackageParser for GemspecParser {
959    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
960
961    fn extract_packages(path: &Path) -> Vec<PackageData> {
962        let content = match fs::read_to_string(path) {
963            Ok(c) => c,
964            Err(e) => {
965                warn!("Failed to read .gemspec at {:?}: {}", path, e);
966                return vec![default_package_data_with_datasource(DatasourceId::Gemspec)];
967            }
968        };
969
970        vec![parse_gemspec_with_context(&content, path.parent())]
971    }
972
973    fn is_match(path: &Path) -> bool {
974        path.extension()
975            .and_then(|ext| ext.to_str())
976            .is_some_and(|ext| ext == "gemspec")
977    }
978}
979
980/// Cleans a value extracted from gemspec by stripping quotes, .freeze, %q{}, and brackets.
981fn clean_gemspec_value(s: &str) -> String {
982    let s = strip_freeze_suffix(s).trim();
983
984    let s = if let Some(pos) = s.find(" #") {
985        s[..pos].trim()
986    } else {
987        s
988    };
989
990    let s = if let Some(stripped) = s.strip_prefix("%q{") {
991        stripped.strip_suffix('}').unwrap_or(stripped)
992    } else if let Some(stripped) = s.strip_prefix("%q<") {
993        stripped.strip_suffix('>').unwrap_or(stripped)
994    } else if let Some(stripped) = s.strip_prefix("%q[") {
995        stripped.strip_suffix(']').unwrap_or(stripped)
996    } else if let Some(stripped) = s.strip_prefix("%q(") {
997        stripped.strip_suffix(')').unwrap_or(stripped)
998    } else {
999        s
1000    };
1001
1002    let s = s
1003        .trim_start_matches('"')
1004        .trim_end_matches('"')
1005        .trim_start_matches('\'')
1006        .trim_end_matches('\'');
1007    let s = strip_freeze_suffix(s).trim();
1008    s.to_string()
1009}
1010
1011/// Extracts items from a Ruby array literal like `["a", "b", "c"]`.
1012fn extract_ruby_array(s: &str) -> Vec<String> {
1013    let s = strip_freeze_suffix(s.trim());
1014    let s = s.trim_start_matches('[').trim_end_matches(']');
1015    let item_re = match Regex::new(r#"["']([^"']*?)["'](?:\.freeze)?"#) {
1016        Ok(r) => r,
1017        Err(_) => return Vec::new(),
1018    };
1019    item_re
1020        .captures_iter(s)
1021        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
1022        .collect()
1023}
1024
1025fn extract_all_ruby_values(s: &str) -> Vec<String> {
1026    let value_re = match Regex::new(r#"%q[\{<\[(]([^\}>\])]+)[\}>\])]|["']([^"']+)["']"#) {
1027        Ok(r) => r,
1028        Err(_) => return Vec::new(),
1029    };
1030
1031    value_re
1032        .captures_iter(s)
1033        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)))
1034        .map(|m| clean_gemspec_value(m.as_str()))
1035        .collect()
1036}
1037
1038fn extract_first_ruby_value(s: &str) -> Option<String> {
1039    extract_all_ruby_values(s).into_iter().next()
1040}
1041
/// Returns the trimmed text that follows the first top-level comma in `args`.
///
/// Commas inside quoted strings, inside `[]`/`{}`/`<>` pairs, or inside parentheses are
/// not treated as argument separators. Within a quoted string a backslash skips the
/// following character. Returns `""` when there is no top-level comma.
fn after_first_argument(args: &str) -> &str {
    let mut nesting = 0usize; // depth of [], {} and <>
    let mut parens = 0usize;
    let mut open_quote: Option<char> = None;
    let mut cursor = args.char_indices();

    while let Some((pos, c)) = cursor.next() {
        if let Some(quote) = open_quote {
            if c == '\\' {
                // Consume the escaped character along with the backslash.
                cursor.next();
            } else if c == quote {
                open_quote = None;
            }
            continue;
        }

        match c {
            '\'' | '"' => open_quote = Some(c),
            '[' | '{' | '<' => nesting += 1,
            ']' | '}' | '>' => nesting = nesting.saturating_sub(1),
            '(' => parens += 1,
            ')' => parens = parens.saturating_sub(1),
            ',' if nesting == 0 && parens == 0 => return args[pos + 1..].trim(),
            _ => {}
        }
    }

    ""
}
1079
1080/// Bug #2: Resolves variable version references like `CSV::VERSION` or `RAILS_VERSION`.
1081///
1082/// Scans the file content for constant definitions matching the variable name
1083/// and returns the resolved string value.
1084fn resolve_variable_version(var_name: &str, contexts: &[String]) -> Option<String> {
1085    let var_name = var_name.trim();
1086    if var_name.is_empty() {
1087        return None;
1088    }
1089
1090    for candidate in candidate_constant_names(var_name) {
1091        let escaped = regex::escape(&candidate);
1092        let pattern = format!(r#"(?m)^\s*{}\s*=\s*["']([^"']+)["']"#, escaped);
1093        let Ok(re) = Regex::new(&pattern) else {
1094            continue;
1095        };
1096
1097        for context in contexts {
1098            if let Some(caps) = re.captures(context) {
1099                return caps.get(1).map(|m| m.as_str().to_string());
1100            }
1101        }
1102    }
1103
1104    None
1105}
1106
1107fn resolve_variable_array(var_name: &str, contexts: &[String]) -> Option<Vec<String>> {
1108    let var_name = var_name.trim();
1109    if var_name.is_empty() {
1110        return None;
1111    }
1112
1113    for candidate in candidate_constant_names(var_name) {
1114        let escaped = regex::escape(&candidate);
1115        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(\[[^\n]+\])"#, escaped);
1116        let Ok(re) = Regex::new(&pattern) else {
1117            continue;
1118        };
1119
1120        for context in contexts {
1121            if let Some(caps) = re.captures(context)
1122                && let Some(raw) = caps.get(1)
1123            {
1124                let values = extract_ruby_array(raw.as_str());
1125                if !values.is_empty() {
1126                    return Some(values);
1127                }
1128            }
1129        }
1130    }
1131
1132    None
1133}
1134
/// Lists the names to try when looking up a constant: the reference as written, followed
/// by its last `::`-separated segment when that differs (e.g. `CSV::VERSION` -> `VERSION`).
fn candidate_constant_names(var_name: &str) -> Vec<String> {
    let tail = var_name
        .split("::")
        .last()
        .filter(|segment| *segment != var_name);

    std::iter::once(var_name)
        .chain(tail)
        .map(str::to_string)
        .collect()
}
1144
1145fn load_required_ruby_contexts(content: &str, base_dir: Option<&Path>) -> Vec<String> {
1146    let mut contexts = vec![content.to_string()];
1147    let Some(base_dir) = base_dir else {
1148        return contexts;
1149    };
1150
1151    let require_re = match Regex::new(r#"(?m)^\s*require(?:_relative)?\s+["']([^"']+)["']"#) {
1152        Ok(re) => re,
1153        Err(_) => return contexts,
1154    };
1155
1156    for caps in require_re.captures_iter(content) {
1157        let Some(required) = caps.get(1).map(|m| m.as_str()) else {
1158            continue;
1159        };
1160        for candidate in candidate_require_paths(base_dir, required) {
1161            if let Ok(required_content) = fs::read_to_string(&candidate) {
1162                contexts.push(required_content);
1163                break;
1164            }
1165        }
1166    }
1167
1168    contexts
1169}
1170
/// Maps a `require` target to the file paths where it may live.
///
/// `::` separators become `/` and a `.rb` extension is appended when missing. The file is
/// looked for directly under `base_dir` first, then under `base_dir/lib`.
fn candidate_require_paths(base_dir: &Path, required: &str) -> Vec<PathBuf> {
    let mut filename = required.replace("::", "/");
    if !filename.ends_with(".rb") {
        filename.push_str(".rb");
    }

    [base_dir.to_path_buf(), base_dir.join("lib")]
        .into_iter()
        .map(|root| root.join(&filename))
        .collect()
}
1184
/// Heuristic: `s` looks like a Ruby constant when it contains `::` or starts with an
/// ASCII uppercase letter.
fn looks_like_constant_reference(s: &str) -> bool {
    if s.contains("::") {
        return true;
    }
    matches!(s.chars().next(), Some(first) if first.is_ascii_uppercase())
}
1188
/// Test-only convenience wrapper: parses gemspec content without a base directory, so no
/// files named by `require` statements are loaded.
#[cfg(test)]
fn parse_gemspec(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_gemspec_with_context(content, no_base_dir)
}
1194
1195fn parse_gemspec_with_context(content: &str, base_dir: Option<&Path>) -> PackageData {
1196    let contexts = load_required_ruby_contexts(content, base_dir);
1197
1198    // Regex for spec.name = "value" or s.name = "value"
1199    // The spec variable name varies: spec, s, gem, etc.
1200    let field_re = match Regex::new(
1201        r#"(?m)^\s*\w+\.(name|version|summary|description|homepage|license)\s*=\s*(.+)$"#,
1202    ) {
1203        Ok(r) => r,
1204        Err(e) => {
1205            warn!("Failed to compile gemspec field regex: {}", e);
1206            return default_package_data_with_datasource(DatasourceId::Gemspec);
1207        }
1208    };
1209
1210    let licenses_re = match Regex::new(r#"(?m)^\s*\w+\.licenses\s*=\s*(.+)$"#) {
1211        Ok(r) => r,
1212        Err(e) => {
1213            warn!("Failed to compile licenses regex: {}", e);
1214            return default_package_data_with_datasource(DatasourceId::Gemspec);
1215        }
1216    };
1217
1218    let authors_re = match Regex::new(r#"(?m)^\s*\w+\.(?:authors|author)\s*=\s*(.+)$"#) {
1219        Ok(r) => r,
1220        Err(e) => {
1221            warn!("Failed to compile authors regex: {}", e);
1222            return default_package_data_with_datasource(DatasourceId::Gemspec);
1223        }
1224    };
1225
1226    let email_re = match Regex::new(r#"(?m)^\s*\w+\.email\s*=\s*(.+)$"#) {
1227        Ok(r) => r,
1228        Err(e) => {
1229            warn!("Failed to compile email regex: {}", e);
1230            return default_package_data_with_datasource(DatasourceId::Gemspec);
1231        }
1232    };
1233
1234    let dependency_call_re = match Regex::new(
1235        r#"(?m)^\s*\w+\.(add_(?:development_|runtime_)?dependency)\s*\(?(.+?)\)?\s*$"#,
1236    ) {
1237        Ok(r) => r,
1238        Err(e) => {
1239            warn!("Failed to compile gemspec dependency regex: {}", e);
1240            return default_package_data_with_datasource(DatasourceId::Gemspec);
1241        }
1242    };
1243
1244    let mut name: Option<String> = None;
1245    let mut version: Option<String> = None;
1246    let mut summary: Option<String> = None;
1247    let mut description: Option<String> = None;
1248    let mut homepage: Option<String> = None;
1249    let mut license: Option<String> = None;
1250    let mut licenses: Vec<String> = Vec::new();
1251    let mut authors: Vec<String> = Vec::new();
1252    let mut emails: Vec<String> = Vec::new();
1253    let mut dependencies: Vec<Dependency> = Vec::new();
1254
1255    // Extract basic fields
1256    for caps in field_re.captures_iter(content) {
1257        let field_name = match caps.get(1) {
1258            Some(m) => m.as_str(),
1259            None => continue,
1260        };
1261        let raw_value = match caps.get(2) {
1262            Some(m) => m.as_str().trim(),
1263            None => continue,
1264        };
1265
1266        match field_name {
1267            "name" => {
1268                let cleaned = clean_gemspec_value(raw_value);
1269                name = if looks_like_constant_reference(&cleaned) {
1270                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1271                } else {
1272                    Some(cleaned)
1273                }
1274            }
1275            "version" => {
1276                let cleaned = clean_gemspec_value(raw_value);
1277                // Bug #2: Check if version is a variable reference
1278                if looks_like_constant_reference(&cleaned) {
1279                    version = resolve_variable_version(&cleaned, &contexts).or(Some(cleaned));
1280                } else {
1281                    version = Some(cleaned);
1282                }
1283            }
1284            "summary" => {
1285                let cleaned = clean_gemspec_value(raw_value);
1286                summary = if looks_like_constant_reference(&cleaned) {
1287                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1288                } else {
1289                    Some(cleaned)
1290                }
1291            }
1292            "description" => description = Some(clean_gemspec_value(raw_value)),
1293            "homepage" => {
1294                let cleaned = clean_gemspec_value(raw_value);
1295                homepage = if looks_like_constant_reference(&cleaned) {
1296                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
1297                } else {
1298                    Some(cleaned)
1299                }
1300            }
1301            "license" => license = Some(clean_gemspec_value(raw_value)),
1302            _ => {}
1303        }
1304    }
1305
1306    // Extract licenses (plural)
1307    for caps in licenses_re.captures_iter(content) {
1308        if let Some(raw) = caps.get(1) {
1309            licenses = extract_ruby_array(raw.as_str());
1310        }
1311    }
1312
1313    // Extract authors
1314    for caps in authors_re.captures_iter(content) {
1315        if let Some(raw) = caps.get(1) {
1316            let raw_str = raw.as_str().trim();
1317            if raw_str.starts_with('[') {
1318                authors = extract_ruby_array(raw_str);
1319            } else if looks_like_constant_reference(raw_str) {
1320                authors = resolve_variable_array(raw_str, &contexts)
1321                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1322            } else {
1323                authors.push(clean_gemspec_value(raw_str));
1324            }
1325        }
1326    }
1327
1328    // Extract emails
1329    for caps in email_re.captures_iter(content) {
1330        if let Some(raw) = caps.get(1) {
1331            let raw_str = raw.as_str().trim();
1332            if raw_str.starts_with('[') {
1333                emails = extract_ruby_array(raw_str);
1334            } else if looks_like_constant_reference(raw_str) {
1335                emails = resolve_variable_array(raw_str, &contexts)
1336                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
1337            } else {
1338                emails.push(clean_gemspec_value(raw_str));
1339            }
1340        }
1341    }
1342
1343    // Build parties from authors and emails
1344    let mut parties: Vec<Party> = Vec::new();
1345
1346    if authors.len() == 1 && emails.len() == 1 {
1347        let email_str = emails.first().map(String::as_str);
1348        let (parsed_email_name, parsed_email) = match email_str {
1349            Some(e) => split_name_email(e),
1350            None => (None, None),
1351        };
1352
1353        parties.push(Party {
1354            r#type: Some("person".to_string()),
1355            role: Some("author".to_string()),
1356            name: authors.first().cloned().or(parsed_email_name),
1357            email: parsed_email.or_else(|| {
1358                email_str
1359                    .filter(|e| e.contains('@') && !e.contains('<'))
1360                    .map(|e| e.to_string())
1361            }),
1362            url: None,
1363            organization: None,
1364            organization_url: None,
1365            timezone: None,
1366        });
1367    } else {
1368        for author_name in authors {
1369            parties.push(Party {
1370                r#type: Some("person".to_string()),
1371                role: Some("author".to_string()),
1372                name: Some(author_name),
1373                email: None,
1374                url: None,
1375                organization: None,
1376                organization_url: None,
1377                timezone: None,
1378            });
1379        }
1380
1381        for email_str in emails {
1382            let (parsed_email_name, parsed_email) = if email_str.contains('<') {
1383                split_name_email(&email_str)
1384            } else {
1385                (None, None)
1386            };
1387            parties.push(Party {
1388                r#type: Some("person".to_string()),
1389                role: Some("author".to_string()),
1390                name: parsed_email_name,
1391                email: parsed_email.or_else(|| email_str.contains('@').then_some(email_str)),
1392                url: None,
1393                organization: None,
1394                organization_url: None,
1395                timezone: None,
1396            });
1397        }
1398    }
1399
1400    for caps in dependency_call_re.captures_iter(content) {
1401        let method = match caps.get(1) {
1402            Some(m) => m.as_str(),
1403            None => continue,
1404        };
1405        let args = match caps.get(2) {
1406            Some(m) => m.as_str(),
1407            None => continue,
1408        };
1409
1410        let Some(dep_name) = extract_first_ruby_value(args) else {
1411            continue;
1412        };
1413        let version_parts = extract_all_ruby_values(after_first_argument(args));
1414        let extracted_requirement = if version_parts.is_empty() {
1415            None
1416        } else {
1417            Some(version_parts.join(", "))
1418        };
1419        let purl = create_gem_purl(&dep_name, None);
1420        let is_development = method == "add_development_dependency";
1421        let scope = if is_development {
1422            "development"
1423        } else {
1424            "runtime"
1425        };
1426
1427        dependencies.push(Dependency {
1428            purl,
1429            extracted_requirement,
1430            scope: Some(scope.to_string()),
1431            is_runtime: Some(!is_development),
1432            is_optional: Some(is_development),
1433            is_pinned: None,
1434            is_direct: Some(true),
1435            resolved_package: None,
1436            extra_data: None,
1437        });
1438    }
1439
1440    // Extract license statement only - detection happens in separate engine
1441    let extracted_license_statement = if !licenses.is_empty() {
1442        Some(licenses.join(" AND "))
1443    } else {
1444        license
1445    };
1446
1447    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
1448        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1449
1450    // Prefer description over summary
1451    let final_description = description.or(summary);
1452
1453    // Build PURL
1454    let purl = name
1455        .as_deref()
1456        .map(|n| create_gem_purl(n, version.as_deref()))
1457        .unwrap_or(None);
1458
1459    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1460        if let Some(n) = name.as_deref() {
1461            get_rubygems_urls(n, version.as_deref(), None)
1462        } else {
1463            (None, None, None, None)
1464        };
1465
1466    PackageData {
1467        package_type: Some(PACKAGE_TYPE),
1468        name,
1469        version,
1470        primary_language: Some("Ruby".to_string()),
1471        description: final_description,
1472        homepage_url: homepage,
1473        download_url,
1474        declared_license_expression,
1475        declared_license_expression_spdx,
1476        license_detections,
1477        extracted_license_statement,
1478        parties,
1479        dependencies,
1480        repository_homepage_url,
1481        repository_download_url,
1482        api_data_url,
1483        datasource_id: Some(DatasourceId::Gemspec),
1484        purl,
1485        ..default_package_data()
1486    }
1487}
1488
1489// =============================================================================
1490// .gem Archive Parser (Wave 3)
1491// =============================================================================
1492
/// Largest `.gem` file, in bytes, that `extract_gem_archive` will open.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
/// Upper bound, in bytes, for `metadata.gz` — applied both to the compressed tar entry
/// and to its decompressed content.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
/// Highest accepted decompressed-to-compressed size ratio for `metadata.gz`.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
1496
1497/// Parser for .gem archive files.
1498///
1499/// Extracts metadata from Ruby .gem packages, which are tar archives
1500/// containing a gzip-compressed YAML metadata file (`metadata.gz`).
1501///
1502/// Includes safety checks against zip bombs and oversized archives.
1503pub struct GemArchiveParser;
1504
1505impl PackageParser for GemArchiveParser {
1506    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1507
1508    fn extract_packages(path: &Path) -> Vec<PackageData> {
1509        vec![match extract_gem_archive(path) {
1510            Ok(data) => data,
1511            Err(e) => {
1512                warn!("Failed to extract .gem archive at {:?}: {}", path, e);
1513                default_package_data_with_datasource(DatasourceId::GemArchive)
1514            }
1515        }]
1516    }
1517
1518    fn is_match(path: &Path) -> bool {
1519        path.extension()
1520            .and_then(|ext| ext.to_str())
1521            .is_some_and(|ext| ext == "gem")
1522    }
1523}
1524
1525fn extract_gem_archive(path: &Path) -> Result<PackageData, String> {
1526    let file_metadata =
1527        fs::metadata(path).map_err(|e| format!("Failed to read file metadata: {}", e))?;
1528    let archive_size = file_metadata.len();
1529
1530    if archive_size > MAX_ARCHIVE_SIZE {
1531        return Err(format!(
1532            "Archive too large: {} bytes (limit: {} bytes)",
1533            archive_size, MAX_ARCHIVE_SIZE
1534        ));
1535    }
1536
1537    let file = File::open(path).map_err(|e| format!("Failed to open archive: {}", e))?;
1538    let mut archive = Archive::new(file);
1539
1540    for entry_result in archive
1541        .entries()
1542        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1543    {
1544        let entry = entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1545        let entry_path = entry
1546            .path()
1547            .map_err(|e| format!("Failed to get entry path: {}", e))?;
1548
1549        if entry_path.to_str() == Some("metadata.gz") {
1550            let entry_size = entry.size();
1551            if entry_size > MAX_FILE_SIZE {
1552                return Err(format!(
1553                    "metadata.gz too large: {} bytes (limit: {} bytes)",
1554                    entry_size, MAX_FILE_SIZE
1555                ));
1556            }
1557
1558            let mut decoder = GzDecoder::new(entry);
1559            let mut content = String::new();
1560            decoder
1561                .read_to_string(&mut content)
1562                .map_err(|e| format!("Failed to decompress metadata.gz: {}", e))?;
1563
1564            let uncompressed_size = content.len() as u64;
1565            if entry_size > 0 {
1566                let ratio = uncompressed_size as f64 / entry_size as f64;
1567                if ratio > MAX_COMPRESSION_RATIO {
1568                    return Err(format!(
1569                        "Suspicious compression ratio: {:.2}:1 (limit: {:.0}:1)",
1570                        ratio, MAX_COMPRESSION_RATIO
1571                    ));
1572                }
1573            }
1574            if uncompressed_size > MAX_FILE_SIZE {
1575                return Err(format!(
1576                    "Decompressed metadata too large: {} bytes (limit: {} bytes)",
1577                    uncompressed_size, MAX_FILE_SIZE
1578                ));
1579            }
1580
1581            return parse_gem_metadata_yaml(&content, DatasourceId::GemArchive);
1582        }
1583    }
1584
1585    Err("metadata.gz not found in .gem archive".to_string())
1586}
1587
1588fn parse_gem_metadata_yaml(
1589    content: &str,
1590    datasource_id: DatasourceId,
1591) -> Result<PackageData, String> {
1592    // Ruby YAML tagged types need to be handled:
1593    // --- !ruby/object:Gem::Specification
1594    // We strip Ruby-specific YAML tags since serde_yaml can't handle them
1595    let cleaned = clean_ruby_yaml_tags(content);
1596
1597    let yaml: serde_yaml::Value =
1598        serde_yaml::from_str(&cleaned).map_err(|e| format!("Failed to parse YAML: {}", e))?;
1599
1600    let name = yaml_string(&yaml, "name");
1601    let version = yaml.get("version").and_then(|v| {
1602        // version can be a simple string or a mapping with a "version" key
1603        if v.is_string() {
1604            v.as_str().map(|s| s.to_string())
1605        } else {
1606            yaml_string(v, "version")
1607        }
1608    });
1609    let description = yaml_string(&yaml, "description").or_else(|| yaml_string(&yaml, "summary"));
1610    let homepage = yaml_string(&yaml, "homepage");
1611    let summary = yaml_string(&yaml, "summary");
1612
1613    // Licenses
1614    let licenses: Vec<String> = yaml
1615        .get("licenses")
1616        .and_then(|v| v.as_sequence())
1617        .map(|seq| {
1618            seq.iter()
1619                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1620                .collect()
1621        })
1622        .unwrap_or_default();
1623
1624    // Extract license statement only - detection happens in separate engine
1625    let extracted_license_statement = if !licenses.is_empty() {
1626        Some(licenses.join(" AND "))
1627    } else {
1628        None
1629    };
1630
1631    let (license_expression, license_expression_spdx, license_detections) =
1632        normalize_spdx_declared_license(extracted_license_statement.as_deref());
1633
1634    // Authors
1635    let authors: Vec<String> = yaml
1636        .get("authors")
1637        .and_then(|v| v.as_sequence())
1638        .map(|seq| {
1639            seq.iter()
1640                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1641                .collect()
1642        })
1643        .unwrap_or_default();
1644
1645    let emails: Vec<String> = yaml
1646        .get("email")
1647        .map(|v| {
1648            if let Some(seq) = v.as_sequence() {
1649                seq.iter()
1650                    .filter_map(|item| item.as_str().map(|s| s.to_string()))
1651                    .collect()
1652            } else if let Some(s) = v.as_str() {
1653                vec![s.to_string()]
1654            } else {
1655                Vec::new()
1656            }
1657        })
1658        .unwrap_or_default();
1659
1660    // Build parties
1661    let mut parties: Vec<Party> = Vec::new();
1662    let max_len = authors.len().max(emails.len());
1663    for i in 0..max_len {
1664        let author_name = authors.get(i).map(|s| s.as_str());
1665        let email_str = emails.get(i).map(|s| s.as_str());
1666
1667        let (parsed_email_name, parsed_email) = match email_str {
1668            Some(e) if e.contains('<') => split_name_email(e),
1669            None => (None, None),
1670            _ => (None, None),
1671        };
1672
1673        let party_name = author_name.map(|s| s.to_string()).or(parsed_email_name);
1674
1675        parties.push(Party {
1676            r#type: Some("person".to_string()),
1677            role: Some("author".to_string()),
1678            name: party_name,
1679            email: parsed_email.or_else(|| {
1680                email_str
1681                    .filter(|e| e.contains('@') && !e.contains('<'))
1682                    .map(|e| e.to_string())
1683            }),
1684            url: None,
1685            organization: None,
1686            organization_url: None,
1687            timezone: None,
1688        });
1689    }
1690
1691    // Dependencies
1692    let dependencies = parse_gem_yaml_dependencies(&yaml);
1693
1694    let metadata = yaml.get("metadata");
1695
1696    let bug_tracking_url = metadata.and_then(|m| yaml_string(m, "bug_tracking_uri"));
1697
1698    let code_view_url = metadata.and_then(|m| yaml_string(m, "source_code_uri"));
1699
1700    let vcs_url = code_view_url
1701        .clone()
1702        .or_else(|| metadata.and_then(|m| yaml_string(m, "homepage_uri")));
1703
1704    let file_references = metadata
1705        .and_then(|m| m.get("files"))
1706        .and_then(|f| f.as_sequence())
1707        .map(|seq| {
1708            seq.iter()
1709                .filter_map(|v| v.as_str())
1710                .map(|s| crate::models::FileReference {
1711                    path: s.to_string(),
1712                    size: None,
1713                    sha1: None,
1714                    md5: None,
1715                    sha256: None,
1716                    sha512: None,
1717                    extra_data: None,
1718                })
1719                .collect::<Vec<_>>()
1720        })
1721        .unwrap_or_default();
1722
1723    let release_date = yaml_string(&yaml, "date").and_then(|d| {
1724        if d.len() >= 10 {
1725            Some(d[..10].to_string())
1726        } else {
1727            None
1728        }
1729    });
1730
1731    let purl = name
1732        .as_deref()
1733        .map(|n| create_gem_purl(n, version.as_deref()))
1734        .unwrap_or(None);
1735
1736    let platform = yaml_string(&yaml, "platform");
1737    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1738        if let Some(n) = name.as_deref() {
1739            get_rubygems_urls(n, version.as_deref(), platform.as_deref())
1740        } else {
1741            (None, None, None, None)
1742        };
1743
1744    let qualifiers = if let Some(ref p) = platform {
1745        if p != "ruby" {
1746            let mut q = HashMap::new();
1747            q.insert("platform".to_string(), p.clone());
1748            Some(q)
1749        } else {
1750            None
1751        }
1752    } else {
1753        None
1754    };
1755
1756    Ok(PackageData {
1757        package_type: Some(PACKAGE_TYPE),
1758        name,
1759        version,
1760        qualifiers,
1761        primary_language: Some("Ruby".to_string()),
1762        description: description.or(summary),
1763        release_date,
1764        homepage_url: homepage,
1765        download_url,
1766        bug_tracking_url,
1767        code_view_url,
1768        declared_license_expression: license_expression,
1769        declared_license_expression_spdx: license_expression_spdx,
1770        license_detections,
1771        extracted_license_statement,
1772        file_references,
1773        parties,
1774        dependencies,
1775        repository_homepage_url,
1776        repository_download_url,
1777        api_data_url,
1778        datasource_id: Some(datasource_id),
1779        purl,
1780        vcs_url,
1781        ..default_package_data()
1782    })
1783}
1784
1785/// Strips Ruby-specific YAML tags that serde_yaml cannot handle.
1786fn clean_ruby_yaml_tags(content: &str) -> String {
1787    let tag_re = match Regex::new(r"!ruby/\S+") {
1788        Ok(r) => r,
1789        Err(_) => return content.to_string(),
1790    };
1791    tag_re.replace_all(content, "").to_string()
1792}
1793
1794fn yaml_string(yaml: &serde_yaml::Value, key: &str) -> Option<String> {
1795    yaml.get(key)
1796        .and_then(|v| v.as_str())
1797        .filter(|s| !s.is_empty())
1798        .map(|s| s.to_string())
1799}
1800
1801fn parse_gem_yaml_dependencies(yaml: &serde_yaml::Value) -> Vec<Dependency> {
1802    let mut dependencies = Vec::new();
1803
1804    let deps_seq = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
1805        Some(seq) => seq,
1806        None => return dependencies,
1807    };
1808
1809    for dep_value in deps_seq {
1810        let dep_name = match yaml_string(dep_value, "name") {
1811            Some(n) => n,
1812            None => continue,
1813        };
1814
1815        let dep_type = yaml_string(dep_value, "type");
1816        let is_development = dep_type.as_deref() == Some(":development");
1817
1818        // Extract version requirements from the nested structure
1819        let requirements = dep_value
1820            .get("requirement")
1821            .or_else(|| dep_value.get("version_requirements"))
1822            .and_then(|req| req.get("requirements"))
1823            .and_then(|reqs| reqs.as_sequence());
1824
1825        let extracted_requirement = requirements.map(|reqs| {
1826            let parts: Vec<String> = reqs
1827                .iter()
1828                .filter_map(|req| {
1829                    let seq = req.as_sequence()?;
1830                    if seq.len() >= 2 {
1831                        let op = seq[0].as_str().unwrap_or("");
1832                        let ver = seq[1].get("version").and_then(|v| v.as_str()).unwrap_or("");
1833                        if op == ">=" && ver == "0" {
1834                            // ">= 0" means "any version" - skip
1835                            None
1836                        } else if op.is_empty() || ver.is_empty() {
1837                            None
1838                        } else {
1839                            Some(format!("{} {}", op, ver))
1840                        }
1841                    } else {
1842                        None
1843                    }
1844                })
1845                .collect();
1846            parts.join(", ")
1847        });
1848
1849        let extracted_requirement = extracted_requirement
1850            .filter(|s| !s.is_empty())
1851            .or_else(|| Some(String::new()));
1852
1853        let (scope, is_runtime, is_optional) = if is_development {
1854            (Some("development".to_string()), false, true)
1855        } else {
1856            (Some("runtime".to_string()), true, false)
1857        };
1858
1859        let purl = create_gem_purl(&dep_name, None);
1860
1861        dependencies.push(Dependency {
1862            purl,
1863            extracted_requirement,
1864            scope,
1865            is_runtime: Some(is_runtime),
1866            is_optional: Some(is_optional),
1867            is_pinned: None,
1868            is_direct: Some(true),
1869            resolved_package: None,
1870            extra_data: None,
1871        });
1872    }
1873
1874    dependencies
1875}
1876
1877// =============================================================================
1878// Gem Metadata Extracted Parser (metadata.gz-extract files)
1879// =============================================================================
1880
1881pub struct GemMetadataExtractedParser;
1882
1883impl PackageParser for GemMetadataExtractedParser {
1884    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1885
1886    fn extract_packages(path: &Path) -> Vec<PackageData> {
1887        vec![match extract_gem_metadata_extracted(path) {
1888            Ok(data) => data,
1889            Err(e) => {
1890                warn!("Failed to extract gem metadata from {:?}: {}", path, e);
1891                default_package_data_with_datasource(DatasourceId::GemArchiveExtracted)
1892            }
1893        }]
1894    }
1895
1896    fn is_match(path: &Path) -> bool {
1897        path.to_str()
1898            .is_some_and(|p| p.contains("metadata.gz-extract"))
1899    }
1900}
1901
1902fn extract_gem_metadata_extracted(path: &Path) -> Result<PackageData, String> {
1903    let content = fs::read_to_string(path)
1904        .map_err(|e| format!("Failed to read metadata.gz-extract file: {}", e))?;
1905
1906    parse_gem_metadata_yaml(&content, DatasourceId::GemArchiveExtracted)
1907}
1908
// Register parser metadata for each Ruby/RubyGems format handled in this file.
// NOTE(review): the arguments appear to be (description, path glob patterns,
// package type, primary language, documentation URL) — confirm against the
// `register_parser!` macro definition.

// Gemfile manifests, including ones found inside an extracted gem data archive.
crate::register_parser!(
    "Ruby Gemfile manifest",
    &["**/Gemfile", "**/data.gz-extract/Gemfile"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// Gemfile.lock lockfiles.
crate::register_parser!(
    "Ruby Gemfile.lock lockfile",
    &["**/Gemfile.lock", "**/data.gz-extract/Gemfile.lock"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// *.gemspec specification files.
crate::register_parser!(
    "Ruby .gemspec manifest",
    &[
        "**/*.gemspec",
        "**/data.gz-extract/*.gemspec",
        "**/specifications/*.gemspec"
    ],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// *.gem archive packages.
crate::register_parser!(
    "Ruby .gem archive",
    &["**/*.gem"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Pre-extracted gem metadata files.
crate::register_parser!(
    "Ruby gem metadata (extracted)",
    &["**/metadata.gz-extract"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);
1953
#[cfg(test)]
mod tests {
    use super::{clean_gemspec_value, parse_gemspec};

    /// A `%q{` literal with no closing brace still yields its inner text.
    #[test]
    fn test_clean_gemspec_value_handles_unterminated_percent_q() {
        let cleaned = clean_gemspec_value("%q{Arel is a SQL AST manager for Ruby. It");
        assert_eq!(cleaned, "Arel is a SQL AST manager for Ruby. It");
    }

    /// Both `add_runtime_dependency` and `add_dependency` map to the
    /// `runtime` scope and keep their version requirement.
    #[test]
    fn test_parse_gemspec_runtime_dependency_scope() {
        let content = r#"
Gem::Specification.new do |spec|
  spec.name = "demo"
  spec.version = "1.0.0"
  spec.add_runtime_dependency "rack", "~> 3.0"
  spec.add_dependency "thor", ">= 1.0"
end
"#;

        // (scope, extracted requirement) in declaration order.
        let expected = [("runtime", "~> 3.0"), ("runtime", ">= 1.0")];

        let package_data = parse_gemspec(content);
        assert_eq!(package_data.dependencies.len(), expected.len());

        for (dependency, (scope, requirement)) in package_data.dependencies.iter().zip(expected) {
            assert_eq!(dependency.scope, Some(scope.to_string()));
            assert_eq!(
                dependency.extracted_requirement,
                Some(requirement.to_string())
            );
        }
    }
}