Skip to main content

provenant/parsers/
ruby.rs

1//! Parser for Ruby/RubyGems package manifests.
2//!
3//! Extracts package metadata, dependencies, and platform information from
4//! Gemfile and Gemfile.lock files used by Ruby/Bundler projects.
5//!
6//! # Supported Formats
7//! - Gemfile (manifest with Ruby DSL)
8//! - Gemfile.lock (lockfile with state machine sections)
9//! - *.gemspec (gem specification files)
10//! - *.gem (gem archive packages)
11//! - metadata.gz-extract (pre-extracted gem metadata)
12//!
13//! # Key Features
14//! - State machine parsing for Gemfile.lock sections (GEM, GIT, PATH, SVN, PLATFORMS, BUNDLED WITH, DEPENDENCIES)
15//! - Regex-based Ruby DSL parsing for Gemfile
16//! - Dependency group handling (:development, :test, etc.)
17//! - Platform-specific gem support
18//! - Pessimistic version operator (~>) support
19//! - Bug Fix #1: Strip .freeze suffix from strings
20//! - Bug Fix #4: Correct dependency scope mapping (:runtime → None, :development → "development")
21//!
22//! # Implementation Notes
23//! - Uses regex for pattern matching (not full Ruby AST)
24//! - Graceful error handling: logs warnings and returns default on parse failure
25//! - PURL type: "gem"
26
27use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
28use crate::parsers::utils::split_name_email;
29use flate2::read::GzDecoder;
30use log::warn;
31use packageurl::PackageUrl;
32use regex::Regex;
33use std::collections::HashMap;
34use std::fs::{self, File};
35use std::io::Read;
36use std::path::{Path, PathBuf};
37use tar::Archive;
38
39use super::PackageParser;
40
41const PACKAGE_TYPE: PackageType = PackageType::Gem;
42
43// =============================================================================
44// Bug Fix #1: Strip .freeze suffix from strings
45// =============================================================================
46
/// Removes trailing `.freeze` calls from a Ruby string expression.
///
/// Ruby code often writes `"name".freeze` to make a literal immutable; the
/// suffix carries no information for manifest parsing, so it is dropped.
/// Repeated suffixes (`"x".freeze.freeze`) are all removed.
///
/// # Examples
/// ```ignore
/// assert_eq!(strip_freeze_suffix("\"name\".freeze"), "\"name\"");
/// assert_eq!(strip_freeze_suffix("'1.0.0'.freeze"), "'1.0.0'");
/// ```
pub fn strip_freeze_suffix(s: &str) -> &str {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix(".freeze") {
        remaining = shorter;
    }
    remaining
}
60
/// A block that is currently open while a Gemfile is being scanned.
///
/// Values are kept on a stack by `parse_gemfile`; the innermost entry of each
/// kind decides the groups and the source applied to a `gem` line.
enum GemfileBlock {
    /// A `group ... do` block, holding the group names taken from its symbols.
    Group(Vec<String>),
    /// A `source "..." do` block, holding the quoted source string.
    Source(String),
}
65
66// =============================================================================
67// Gemfile Parser (Ruby DSL)
68// =============================================================================
69
/// Ruby Gemfile parser for manifest files.
///
/// Scans the Ruby DSL line by line with regular expressions to extract `gem`
/// declarations, their version constraints, the enclosing `group` blocks, and
/// the `source` each gem is associated with.
pub struct GemfileParser;
75
76impl PackageParser for GemfileParser {
77    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
78
79    fn extract_packages(path: &Path) -> Vec<PackageData> {
80        let content = match fs::read_to_string(path) {
81            Ok(c) => c,
82            Err(e) => {
83                warn!("Failed to read Gemfile at {:?}: {}", path, e);
84                return vec![default_package_data_with_datasource(DatasourceId::Gemfile)];
85            }
86        };
87
88        vec![parse_gemfile(&content)]
89    }
90
91    fn is_match(path: &Path) -> bool {
92        path.file_name()
93            .and_then(|n| n.to_str())
94            .is_some_and(|name| name == "Gemfile")
95            || path
96                .to_str()
97                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile"))
98    }
99}
100
101/// Parses Gemfile content and extracts dependencies with groups.
102fn parse_gemfile(content: &str) -> PackageData {
103    let mut dependencies = Vec::new();
104    let mut block_stack = Vec::new();
105    let mut default_source = None;
106    let mut sources = Vec::new();
107
108    // Regex patterns for Gemfile parsing
109    // gem "name", "version", options...
110    let gem_regex = match Regex::new(
111        r#"^\s*gem\s+["']([^"']+)["'](?:\.freeze)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*["']([^"']+)["'](?:\.freeze)?)?(?:\s*,\s*(.+))?"#,
112    ) {
113        Ok(r) => r,
114        Err(e) => {
115            warn!("Failed to compile gem regex: {}", e);
116            return default_package_data_with_datasource(DatasourceId::Gemfile);
117        }
118    };
119
120    // group :name do ... end
121    let group_start_regex = match Regex::new(r"^\s*group\s+(.+?)\s+do\s*$") {
122        Ok(r) => r,
123        Err(e) => {
124            warn!("Failed to compile group regex: {}", e);
125            return default_package_data_with_datasource(DatasourceId::Gemfile);
126        }
127    };
128
129    let group_end_regex = match Regex::new(r"^\s*end\s*$") {
130        Ok(r) => r,
131        Err(e) => {
132            warn!("Failed to compile end regex: {}", e);
133            return default_package_data_with_datasource(DatasourceId::Gemfile);
134        }
135    };
136
137    let source_block_start_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s+do\s*$"#) {
138        Ok(r) => r,
139        Err(e) => {
140            warn!("Failed to compile source block regex: {}", e);
141            return default_package_data_with_datasource(DatasourceId::Gemfile);
142        }
143    };
144
145    let source_regex = match Regex::new(r#"^\s*source\s+["']([^"']+)["']\s*$"#) {
146        Ok(r) => r,
147        Err(e) => {
148            warn!("Failed to compile source regex: {}", e);
149            return default_package_data_with_datasource(DatasourceId::Gemfile);
150        }
151    };
152
153    // Parse symbols like :development, :test
154    let symbol_regex = match Regex::new(r":(\w+)") {
155        Ok(r) => r,
156        Err(e) => {
157            warn!("Failed to compile symbol regex: {}", e);
158            return default_package_data_with_datasource(DatasourceId::Gemfile);
159        }
160    };
161
162    for line in content.lines() {
163        let trimmed = line.trim();
164
165        // Skip comments and empty lines
166        if trimmed.is_empty() || trimmed.starts_with('#') {
167            continue;
168        }
169
170        // Check for group start
171        if let Some(caps) = group_start_regex.captures(trimmed) {
172            let groups_str = caps.get(1).map(|m| m.as_str()).unwrap_or("");
173            let mut current_groups = Vec::new();
174            for cap in symbol_regex.captures_iter(groups_str) {
175                if let Some(group_name) = cap.get(1) {
176                    current_groups.push(group_name.as_str().to_string());
177                }
178            }
179            block_stack.push(GemfileBlock::Group(current_groups));
180            continue;
181        }
182
183        if let Some(caps) = source_block_start_regex.captures(trimmed) {
184            let source = caps
185                .get(1)
186                .map(|m| m.as_str().to_string())
187                .unwrap_or_default();
188            if !source.is_empty() {
189                push_unique_string(&mut sources, source.clone());
190                block_stack.push(GemfileBlock::Source(source));
191            }
192            continue;
193        }
194
195        if let Some(caps) = source_regex.captures(trimmed) {
196            if let Some(source) = caps.get(1).map(|m| m.as_str().to_string()) {
197                push_unique_string(&mut sources, source.clone());
198                default_source = Some(source);
199            }
200            continue;
201        }
202
203        // Check for group end
204        if group_end_regex.is_match(trimmed) {
205            block_stack.pop();
206            continue;
207        }
208
209        // Parse gem declaration
210        if let Some(caps) = gem_regex.captures(trimmed) {
211            let name = strip_freeze_suffix(caps.get(1).map(|m| m.as_str()).unwrap_or(""));
212            if name.is_empty() {
213                continue;
214            }
215
216            // Collect version constraints
217            let mut version_parts = Vec::new();
218            if let Some(v) = caps.get(2) {
219                version_parts.push(strip_freeze_suffix(v.as_str()).to_string());
220            }
221            if let Some(v) = caps.get(3) {
222                let v_str = strip_freeze_suffix(v.as_str());
223                // Check if it looks like a version constraint
224                if looks_like_version_constraint(v_str) {
225                    version_parts.push(v_str.to_string());
226                }
227            }
228
229            let extracted_requirement = if version_parts.is_empty() {
230                None
231            } else {
232                Some(version_parts.join(", "))
233            };
234
235            let current_groups = current_group_names(&block_stack);
236
237            // Determine scope based on current group
238            // Bug Fix #4: :runtime → None, :development → "development"
239            let (scope, is_runtime, is_optional) = if current_groups.is_empty() {
240                // No group = runtime dependency
241                (None, true, false)
242            } else if current_groups.iter().any(|g| g == "development") {
243                (Some("development".to_string()), false, true)
244            } else if current_groups.iter().any(|g| g == "test") {
245                (Some("test".to_string()), false, true)
246            } else {
247                // Other groups (e.g., :production)
248                let group = current_groups.first().cloned();
249                (group, true, false)
250            };
251
252            // Create PURL
253            let purl = create_gem_purl(name, None);
254            let inherited_source = current_source(&block_stack, default_source.as_deref());
255            let extra_data = build_gemfile_dependency_extra_data(
256                caps.get(4).map(|m| m.as_str()),
257                inherited_source.as_deref(),
258            );
259
260            dependencies.push(Dependency {
261                purl,
262                extracted_requirement,
263                scope,
264                is_runtime: Some(is_runtime),
265                is_optional: Some(is_optional),
266                is_pinned: None,
267                is_direct: Some(true),
268                resolved_package: None,
269                extra_data,
270            });
271        }
272    }
273
274    let extra_data = if sources.is_empty() {
275        None
276    } else {
277        Some(HashMap::from([(
278            "sources".to_string(),
279            serde_json::Value::Array(sources.into_iter().map(serde_json::Value::String).collect()),
280        )]))
281    };
282
283    PackageData {
284        package_type: Some(PACKAGE_TYPE),
285        primary_language: Some("Ruby".to_string()),
286        dependencies,
287        extra_data,
288        datasource_id: Some(DatasourceId::Gemfile),
289        ..default_package_data()
290    }
291}
292
293fn current_group_names(block_stack: &[GemfileBlock]) -> Vec<String> {
294    block_stack
295        .iter()
296        .rev()
297        .find_map(|block| match block {
298            GemfileBlock::Group(groups) => Some(groups.clone()),
299            GemfileBlock::Source(_) => None,
300        })
301        .unwrap_or_default()
302}
303
304fn current_source(block_stack: &[GemfileBlock], default_source: Option<&str>) -> Option<String> {
305    block_stack
306        .iter()
307        .rev()
308        .find_map(|block| match block {
309            GemfileBlock::Source(source) => Some(source.clone()),
310            GemfileBlock::Group(_) => None,
311        })
312        .or_else(|| default_source.map(str::to_string))
313}
314
/// Appends `value` to `values` unless an equal string is already present.
fn push_unique_string(values: &mut Vec<String>, value: String) {
    let already_present = values.iter().any(|existing| *existing == value);
    if already_present {
        return;
    }
    values.push(value);
}
320
321fn build_gemfile_dependency_extra_data(
322    options: Option<&str>,
323    inherited_source: Option<&str>,
324) -> Option<HashMap<String, serde_json::Value>> {
325    let mut extra = HashMap::new();
326    let options = options.unwrap_or("");
327
328    if let Some(git) = extract_gemfile_quoted_option(options, "git") {
329        extra.insert(
330            "source_type".to_string(),
331            serde_json::Value::String("GIT".to_string()),
332        );
333        extra.insert("git".to_string(), serde_json::Value::String(git.clone()));
334        extra.insert("remote".to_string(), serde_json::Value::String(git));
335    }
336
337    if let Some(path) = extract_gemfile_quoted_option(options, "path") {
338        extra.insert(
339            "source_type".to_string(),
340            serde_json::Value::String("PATH".to_string()),
341        );
342        extra.insert("path".to_string(), serde_json::Value::String(path));
343    }
344
345    for key in ["branch", "ref", "tag"] {
346        if let Some(value) = extract_gemfile_quoted_option(options, key) {
347            extra.insert(key.to_string(), serde_json::Value::String(value));
348        }
349    }
350
351    let direct_source = extract_gemfile_quoted_option(options, "source");
352    if let Some(source) = direct_source {
353        extra.insert("source".to_string(), serde_json::Value::String(source));
354    } else if !extra.contains_key("source_type")
355        && let Some(source) = inherited_source
356    {
357        extra.insert(
358            "source".to_string(),
359            serde_json::Value::String(source.to_string()),
360        );
361    }
362
363    (!extra.is_empty()).then_some(extra)
364}
365
366fn extract_gemfile_quoted_option(options: &str, key: &str) -> Option<String> {
367    if options.is_empty() {
368        return None;
369    }
370
371    let pattern = format!(r#"(?:^|,\s*){}\s*:\s*["']([^"']+)["']"#, regex::escape(key));
372    Regex::new(&pattern)
373        .ok()
374        .and_then(|regex| regex.captures(options))
375        .and_then(|captures| captures.get(1).map(|m| m.as_str().to_string()))
376}
377
/// Reports whether `s` begins like a version constraint.
///
/// A constraint starts with one of the operator characters `~`, `>`, `<`,
/// `=`, `!` or with an ASCII digit. The empty string is not a constraint.
fn looks_like_version_constraint(s: &str) -> bool {
    match s.chars().next() {
        Some(first) => matches!(first, '~' | '>' | '<' | '=' | '!') || first.is_ascii_digit(),
        None => false,
    }
}
387
388// =============================================================================
389// Gemfile.lock Parser (State Machine)
390// =============================================================================
391
/// Ruby Gemfile.lock parser for lockfiles.
///
/// Uses a line-oriented state machine to parse the lockfile sections: GEM,
/// GIT, PATH, SVN (each with a nested `specs:` list), PLATFORMS,
/// BUNDLED WITH, and DEPENDENCIES.
pub struct GemfileLockParser;
397
398impl PackageParser for GemfileLockParser {
399    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
400
401    fn extract_packages(path: &Path) -> Vec<PackageData> {
402        let content = match fs::read_to_string(path) {
403            Ok(c) => c,
404            Err(e) => {
405                warn!("Failed to read Gemfile.lock at {:?}: {}", path, e);
406                return vec![default_package_data_with_datasource(
407                    DatasourceId::GemfileLock,
408                )];
409            }
410        };
411
412        vec![parse_gemfile_lock(&content)]
413    }
414
415    fn is_match(path: &Path) -> bool {
416        path.file_name()
417            .and_then(|n| n.to_str())
418            .is_some_and(|name| name == "Gemfile.lock")
419            || path
420                .to_str()
421                .is_some_and(|p| p.contains("data.gz-extract/") && p.ends_with("/Gemfile.lock"))
422    }
423}
424
/// Parse state for Gemfile.lock state machine.
///
/// The state is switched by section header lines and decides how each
/// following line is interpreted.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// No section header has been seen yet; lines are ignored.
    None,
    /// Inside a `GEM` section, before its `specs:` line (option lines are read).
    Gem,
    /// Inside a `GIT` section, before its `specs:` line (option lines are read).
    Git,
    /// Inside a `PATH` section, before its `specs:` line (option lines are read).
    Path,
    /// Inside an `SVN` section, before its `specs:` line (option lines are read).
    Svn,
    /// After a `specs:` line of a GEM/GIT/PATH/SVN section; gem entries are read.
    Specs,
    /// Inside the `PLATFORMS` section; each line is a platform name.
    Platforms,
    /// Inside the `BUNDLED WITH` section; the Bundler version is read.
    BundledWith,
    /// Inside the `DEPENDENCIES` section; direct dependency entries are read.
    Dependencies,
}
438
/// Parsed gem information from Gemfile.lock.
///
/// All fields are actively used:
/// - `gem_type`, `remote`, `revision`, `ref_field`, `branch`, `tag`: Stored in extra_data for GIT/PATH/SVN sources
/// - `name`, `version`, `platform`, `pinned`: Used for dependency PURL and metadata generation
/// - `requirements`: Stored as extracted_requirement for version constraints
#[derive(Debug, Clone, Default)]
struct GemInfo {
    // Gem name as written in a `specs:` or DEPENDENCIES entry.
    name: String,
    // Version part of the parenthesized text of a `specs:` entry, if any.
    version: Option<String>,
    // Platform part (text after the first `-`) of a `specs:` version, if any.
    platform: Option<String>,
    // Section the gem was listed under: "GEM", "GIT", "PATH" or "SVN".
    gem_type: String,
    // Value of the section's `remote:` option, if any.
    remote: Option<String>,
    // Value of the section's `revision:` option, if any.
    revision: Option<String>,
    // Value of the section's `ref:` option, if any.
    ref_field: Option<String>,
    // Value of the section's `branch:` option, if any.
    branch: Option<String>,
    // Value of the section's `tag:` option, if any.
    tag: Option<String>,
    // True when the gem's DEPENDENCIES entry ends with `!`.
    pinned: bool,
    // Version constraints collected from the gem's DEPENDENCIES entries.
    requirements: Vec<String>,
}
459
460/// Parses Gemfile.lock content using a state machine.
461fn parse_gemfile_lock(content: &str) -> PackageData {
462    let mut state = ParseState::None;
463    let mut dependencies = Vec::new();
464    let mut gems: HashMap<String, GemInfo> = HashMap::new();
465    let mut platforms: Vec<String> = Vec::new();
466    let mut bundler_version: Option<String> = None;
467    let mut current_gem_type = String::new();
468    let mut current_remote: Option<String> = None;
469    let mut current_options: HashMap<String, String> = HashMap::new();
470
471    // DEPS pattern: 2 spaces at line start
472    let deps_regex = match Regex::new(r"^ {2}([^ \)\(,!:]+)(?: \(([^)]+)\))?(!)?$") {
473        Ok(r) => r,
474        Err(e) => {
475            warn!("Failed to compile deps regex: {}", e);
476            return default_package_data_with_datasource(DatasourceId::GemfileLock);
477        }
478    };
479
480    // SPEC_DEPS pattern: 4 spaces at line start
481    let spec_deps_regex = match Regex::new(r"^ {4}([^ \)\(,!:]+)(?: \(([^)]+)\))?$") {
482        Ok(r) => r,
483        Err(e) => {
484            warn!("Failed to compile spec_deps regex: {}", e);
485            return default_package_data_with_datasource(DatasourceId::GemfileLock);
486        }
487    };
488
489    // OPTIONS pattern: key: value
490    let options_regex = match Regex::new(r"^ {2}([a-z]+): (.+)$") {
491        Ok(r) => r,
492        Err(e) => {
493            warn!("Failed to compile options regex: {}", e);
494            return default_package_data_with_datasource(DatasourceId::GemfileLock);
495        }
496    };
497
498    // VERSION pattern for BUNDLED WITH
499    let version_regex = match Regex::new(r"^\s+(\d+(?:\.\d+)+)\s*$") {
500        Ok(r) => r,
501        Err(e) => {
502            warn!("Failed to compile version regex: {}", e);
503            return default_package_data_with_datasource(DatasourceId::GemfileLock);
504        }
505    };
506
507    for line in content.lines() {
508        let trimmed = line.trim_end();
509
510        // Empty line resets state
511        if trimmed.is_empty() {
512            current_options.clear();
513            continue;
514        }
515
516        // Section headers (no leading whitespace) and sub-section headers
517        match trimmed {
518            "GEM" => {
519                state = ParseState::Gem;
520                current_gem_type = "GEM".to_string();
521                current_remote = None;
522                current_options.clear();
523                continue;
524            }
525            "GIT" => {
526                state = ParseState::Git;
527                current_gem_type = "GIT".to_string();
528                current_remote = None;
529                current_options.clear();
530                continue;
531            }
532            "PATH" => {
533                state = ParseState::Path;
534                current_gem_type = "PATH".to_string();
535                current_remote = None;
536                current_options.clear();
537                continue;
538            }
539            "SVN" => {
540                state = ParseState::Svn;
541                current_gem_type = "SVN".to_string();
542                current_remote = None;
543                current_options.clear();
544                continue;
545            }
546            "PLATFORMS" => {
547                state = ParseState::Platforms;
548                continue;
549            }
550            "BUNDLED WITH" => {
551                state = ParseState::BundledWith;
552                continue;
553            }
554            "DEPENDENCIES" => {
555                state = ParseState::Dependencies;
556                continue;
557            }
558            _ => {}
559        }
560
561        // Check for "  specs:" sub-section header (2-space indent) within
562        // GEM/GIT/PATH/SVN sections. This must be checked separately because
563        // the leading whitespace is preserved by trim_end().
564        if trimmed.trim() == "specs:" {
565            state = match state {
566                ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
567                    ParseState::Specs
568                }
569                _ => state,
570            };
571            continue;
572        }
573
574        // Process based on current state
575        match state {
576            ParseState::Gem | ParseState::Git | ParseState::Path | ParseState::Svn => {
577                // Parse options (remote:, revision:, ref:, branch:, tag:)
578                if let Some(caps) = options_regex.captures(line) {
579                    let key = caps.get(1).map(|m| m.as_str()).unwrap_or("");
580                    let value = caps.get(2).map(|m| m.as_str()).unwrap_or("");
581                    current_options.insert(key.to_string(), value.to_string());
582                    if key == "remote" {
583                        current_remote = Some(value.to_string());
584                    }
585                }
586            }
587            ParseState::Specs => {
588                // Parse gem specs (4 spaces indent)
589                if let Some(caps) = spec_deps_regex.captures(line) {
590                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
591                    let version_str = caps.get(2).map(|m| m.as_str()).unwrap_or("");
592
593                    // Parse version and platform
594                    let (version, platform) = parse_version_platform(version_str);
595
596                    if !name.is_empty() {
597                        let gem_info = GemInfo {
598                            name: name.clone(),
599                            version,
600                            platform,
601                            gem_type: current_gem_type.clone(),
602                            remote: current_remote.clone(),
603                            revision: current_options.get("revision").cloned(),
604                            ref_field: current_options.get("ref").cloned(),
605                            branch: current_options.get("branch").cloned(),
606                            tag: current_options.get("tag").cloned(),
607                            pinned: false,
608                            requirements: Vec::new(),
609                        };
610                        gems.insert(name, gem_info);
611                    }
612                }
613            }
614            ParseState::Platforms => {
615                // Parse platform entries (2 spaces indent)
616                let platform = trimmed.trim();
617                if !platform.is_empty() {
618                    platforms.push(platform.to_string());
619                }
620            }
621            ParseState::BundledWith => {
622                // Parse bundler version
623                if let Some(caps) = version_regex.captures(line) {
624                    bundler_version = caps.get(1).map(|m| m.as_str().to_string());
625                }
626            }
627            ParseState::Dependencies => {
628                // Parse direct dependencies (2 spaces indent)
629                if let Some(caps) = deps_regex.captures(line) {
630                    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
631                    let version_constraint = caps.get(2).map(|m| m.as_str().to_string());
632                    let pinned = caps.get(3).is_some();
633
634                    if !name.is_empty() {
635                        // Update gem info if exists, or create new
636                        if let Some(gem) = gems.get_mut(&name) {
637                            gem.pinned = pinned;
638                            if let Some(vc) = &version_constraint {
639                                gem.requirements.push(vc.clone());
640                            }
641                        } else {
642                            let gem_info = GemInfo {
643                                name: name.clone(),
644                                version: None,
645                                platform: None,
646                                gem_type: "GEM".to_string(),
647                                remote: None,
648                                revision: None,
649                                ref_field: None,
650                                branch: None,
651                                tag: None,
652                                pinned,
653                                requirements: version_constraint.into_iter().collect(),
654                            };
655                            gems.insert(name, gem_info);
656                        }
657                    }
658                }
659            }
660            ParseState::None => {}
661        }
662    }
663
664    let primary_gem = gems.values().find(|gem| gem.gem_type == "PATH").cloned();
665
666    let (
667        package_name,
668        package_version,
669        repository_homepage_url,
670        repository_download_url,
671        api_data_url,
672        download_url,
673    ) = if let Some(ref pg) = primary_gem {
674        let urls = get_rubygems_urls(&pg.name, pg.version.as_deref(), pg.platform.as_deref());
675        (
676            Some(pg.name.clone()),
677            pg.version.clone(),
678            urls.0,
679            urls.1,
680            urls.2,
681            urls.3,
682        )
683    } else {
684        (None, None, None, None, None, None)
685    };
686
687    for (_, gem) in gems {
688        if let Some(ref pg) = primary_gem
689            && gem.name == pg.name
690        {
691            continue;
692        }
693
694        let version_for_purl = gem.version.as_deref();
695        let purl = create_gem_purl(&gem.name, version_for_purl);
696
697        let extracted_requirement = if !gem.requirements.is_empty() {
698            Some(gem.requirements.join(", "))
699        } else {
700            gem.version.clone()
701        };
702
703        let extra_data = build_gem_source_extra_data(&gem);
704
705        dependencies.push(Dependency {
706            purl,
707            extracted_requirement,
708            scope: Some("dependencies".to_string()),
709            is_runtime: Some(true),
710            is_optional: Some(false),
711            is_pinned: Some(gem.pinned),
712            is_direct: Some(true),
713            resolved_package: None,
714            extra_data,
715        });
716    }
717
718    dependencies.sort_by(|left, right| {
719        left.purl
720            .as_deref()
721            .cmp(&right.purl.as_deref())
722            .then_with(|| {
723                left.extracted_requirement
724                    .as_deref()
725                    .cmp(&right.extracted_requirement.as_deref())
726            })
727    });
728
729    // Build extra_data
730    let mut extra_data = HashMap::new();
731    if !platforms.is_empty() {
732        extra_data.insert(
733            "platforms".to_string(),
734            serde_json::Value::Array(
735                platforms
736                    .into_iter()
737                    .map(serde_json::Value::String)
738                    .collect(),
739            ),
740        );
741    }
742    if let Some(bv) = bundler_version {
743        extra_data.insert("bundler_version".to_string(), serde_json::Value::String(bv));
744    }
745
746    let purl = package_name
747        .as_deref()
748        .map(|n| create_gem_purl(n, package_version.as_deref()))
749        .unwrap_or(None);
750
751    PackageData {
752        package_type: Some(PACKAGE_TYPE),
753        name: package_name,
754        version: package_version,
755        primary_language: Some("Ruby".to_string()),
756        download_url,
757        dependencies,
758        repository_homepage_url,
759        repository_download_url,
760        api_data_url,
761        extra_data: if extra_data.is_empty() {
762            None
763        } else {
764            Some(extra_data)
765        },
766        datasource_id: Some(DatasourceId::GemfileLock),
767        purl,
768        ..default_package_data()
769    }
770}
771
772fn build_gem_source_extra_data(gem: &GemInfo) -> Option<HashMap<String, serde_json::Value>> {
773    if gem.gem_type != "GIT" && gem.gem_type != "PATH" && gem.gem_type != "SVN" {
774        return None;
775    }
776
777    let mut extra = HashMap::new();
778    extra.insert(
779        "source_type".to_string(),
780        serde_json::Value::String(gem.gem_type.clone()),
781    );
782
783    if let Some(ref remote) = gem.remote {
784        extra.insert(
785            "remote".to_string(),
786            serde_json::Value::String(remote.clone()),
787        );
788    }
789    if let Some(ref revision) = gem.revision {
790        extra.insert(
791            "revision".to_string(),
792            serde_json::Value::String(revision.clone()),
793        );
794    }
795    if let Some(ref ref_field) = gem.ref_field {
796        extra.insert(
797            "ref".to_string(),
798            serde_json::Value::String(ref_field.clone()),
799        );
800    }
801    if let Some(ref branch) = gem.branch {
802        extra.insert(
803            "branch".to_string(),
804            serde_json::Value::String(branch.clone()),
805        );
806    }
807    if let Some(ref tag) = gem.tag {
808        extra.insert("tag".to_string(), serde_json::Value::String(tag.clone()));
809    }
810
811    Some(extra)
812}
813
/// Splits a lockfile version string into its version and platform parts.
///
/// The split happens at the first `-`; everything after it is the platform.
/// Examples: "2.6.3" -> ("2.6.3", None), "2.6.3-java" -> ("2.6.3", Some("java")).
/// An empty input yields `(None, None)`.
fn parse_version_platform(s: &str) -> (Option<String>, Option<String>) {
    if s.is_empty() {
        return (None, None);
    }

    match s.split_once('-') {
        Some((version, platform)) => (Some(version.to_owned()), Some(platform.to_owned())),
        None => (Some(s.to_owned()), None),
    }
}
828
829/// Creates a gem PURL.
830fn create_gem_purl(name: &str, version: Option<&str>) -> Option<String> {
831    let mut purl = match PackageUrl::new(PACKAGE_TYPE.as_str(), name) {
832        Ok(p) => p,
833        Err(e) => {
834            warn!("Failed to create PURL for gem '{}': {}", name, e);
835            return None;
836        }
837    };
838
839    if let Some(v) = version
840        && let Err(e) = purl.with_version(v)
841    {
842        warn!("Failed to set version '{}' for gem '{}': {}", v, name, e);
843    }
844
845    Some(purl.to_string())
846}
847
/// Builds the rubygems.org page URL for a gem.
///
/// With a version, the URL points at that version's page; surrounding
/// whitespace and `/` characters are removed from the version first.
/// Returns `None` when `name` is empty.
fn rubygems_homepage_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    let url = match version {
        Some(raw) => format!(
            "https://rubygems.org/gems/{}/versions/{}",
            name,
            raw.trim().trim_matches('/')
        ),
        None => format!("https://rubygems.org/gems/{}", name),
    };
    Some(url)
}
860
/// Builds the rubygems.org download URL of a `.gem` archive.
///
/// The file name is `<name>-<version>.gem`, with `-<platform>` appended to
/// the version for any platform other than `ruby`. Surrounding whitespace and
/// `/` characters are removed from the name and version.
/// Returns `None` when `name` is empty or no version is given.
fn rubygems_download_url(
    name: &str,
    version: Option<&str>,
    platform: Option<&str>,
) -> Option<String> {
    let raw_version = version.filter(|_| !name.is_empty())?;

    let gem_name = name.trim().trim_matches('/');
    let gem_version = raw_version.trim().trim_matches('/');

    let file_stem = match platform {
        Some(p) if p != "ruby" => format!("{}-{}-{}", gem_name, gem_version, p),
        _ => format!("{}-{}", gem_name, gem_version),
    };

    Some(format!("https://rubygems.org/downloads/{}.gem", file_stem))
}
888
/// Builds the rubygems.org JSON API URL for a gem.
///
/// Returns `None` when `name` is empty. With a usable version the v2
/// per-version endpoint is returned; when the version is absent or blank
/// (after trimming whitespace and slashes) the v1 versions listing is used, so
/// a `/versions/.json` URL is never produced.
fn rubygems_api_url(name: &str, version: Option<&str>) -> Option<String> {
    if name.is_empty() {
        return None;
    }

    // Same normalisation as the homepage URL helper applies to its version.
    let version = version
        .map(|v| v.trim().trim_matches('/'))
        .filter(|v| !v.is_empty());

    Some(match version {
        Some(v) => format!(
            "https://rubygems.org/api/v2/rubygems/{}/versions/{}.json",
            name, v
        ),
        None => format!("https://rubygems.org/api/v1/versions/{}.json", name),
    })
}
906
907fn get_rubygems_urls(
908    name: &str,
909    version: Option<&str>,
910    platform: Option<&str>,
911) -> (
912    Option<String>,
913    Option<String>,
914    Option<String>,
915    Option<String>,
916) {
917    let repository_homepage_url = rubygems_homepage_url(name, version);
918    let repository_download_url = rubygems_download_url(name, version, platform);
919    let api_data_url = rubygems_api_url(name, version);
920    let download_url = repository_download_url.clone();
921
922    (
923        repository_homepage_url,
924        repository_download_url,
925        api_data_url,
926        download_url,
927    )
928}
929
930/// Returns a default PackageData with gem-specific settings.
931fn default_package_data() -> PackageData {
932    PackageData {
933        package_type: Some(PACKAGE_TYPE),
934        primary_language: Some("Ruby".to_string()),
935        ..Default::default()
936    }
937}
938
939fn default_package_data_with_datasource(datasource_id: DatasourceId) -> PackageData {
940    PackageData {
941        datasource_id: Some(datasource_id),
942        ..default_package_data()
943    }
944}
945
946// =============================================================================
947// Gemspec Parser (Ruby DSL)
948// =============================================================================
949
950/// Ruby .gemspec file parser.
951///
952/// Parses `Gem::Specification.new` blocks using regex-based extraction.
953/// Handles frozen strings (Bug #1), variable version resolution (Bug #2),
954/// and RFC 5322 email parsing (Bug #6).
955pub struct GemspecParser;
956
957impl PackageParser for GemspecParser {
958    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
959
960    fn extract_packages(path: &Path) -> Vec<PackageData> {
961        let content = match fs::read_to_string(path) {
962            Ok(c) => c,
963            Err(e) => {
964                warn!("Failed to read .gemspec at {:?}: {}", path, e);
965                return vec![default_package_data_with_datasource(DatasourceId::Gemspec)];
966            }
967        };
968
969        vec![parse_gemspec_with_context(&content, path.parent())]
970    }
971
972    fn is_match(path: &Path) -> bool {
973        path.extension()
974            .and_then(|ext| ext.to_str())
975            .is_some_and(|ext| ext == "gemspec")
976    }
977}
978
979/// Cleans a value extracted from gemspec by stripping quotes, .freeze, %q{}, and brackets.
980fn clean_gemspec_value(s: &str) -> String {
981    let s = strip_freeze_suffix(s).trim();
982
983    let s = if let Some(pos) = s.find(" #") {
984        s[..pos].trim()
985    } else {
986        s
987    };
988
989    let s = if let Some(stripped) = s.strip_prefix("%q{") {
990        stripped.strip_suffix('}').unwrap_or(stripped)
991    } else if let Some(stripped) = s.strip_prefix("%q<") {
992        stripped.strip_suffix('>').unwrap_or(stripped)
993    } else if let Some(stripped) = s.strip_prefix("%q[") {
994        stripped.strip_suffix(']').unwrap_or(stripped)
995    } else if let Some(stripped) = s.strip_prefix("%q(") {
996        stripped.strip_suffix(')').unwrap_or(stripped)
997    } else {
998        s
999    };
1000
1001    let s = s
1002        .trim_start_matches('"')
1003        .trim_end_matches('"')
1004        .trim_start_matches('\'')
1005        .trim_end_matches('\'');
1006    let s = strip_freeze_suffix(s).trim();
1007    s.to_string()
1008}
1009
1010/// Extracts items from a Ruby array literal like `["a", "b", "c"]`.
1011fn extract_ruby_array(s: &str) -> Vec<String> {
1012    let s = strip_freeze_suffix(s.trim());
1013    let s = s.trim_start_matches('[').trim_end_matches(']');
1014    let item_re = match Regex::new(r#"["']([^"']*?)["'](?:\.freeze)?"#) {
1015        Ok(r) => r,
1016        Err(_) => return Vec::new(),
1017    };
1018    item_re
1019        .captures_iter(s)
1020        .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
1021        .collect()
1022}
1023
1024fn extract_all_ruby_values(s: &str) -> Vec<String> {
1025    let value_re = match Regex::new(r#"%q[\{<\[(]([^\}>\])]+)[\}>\])]|["']([^"']+)["']"#) {
1026        Ok(r) => r,
1027        Err(_) => return Vec::new(),
1028    };
1029
1030    value_re
1031        .captures_iter(s)
1032        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)))
1033        .map(|m| clean_gemspec_value(m.as_str()))
1034        .collect()
1035}
1036
1037fn extract_first_ruby_value(s: &str) -> Option<String> {
1038    extract_all_ruby_values(s).into_iter().next()
1039}
1040
/// Returns the trimmed text following the first top-level comma in `args`.
///
/// Commas inside quoted strings, inside `[]`/`{}`/`<>` or inside `()` are not
/// treated as separators; a backslash inside a quoted string skips the next
/// character. Returns `""` when there is no top-level comma.
fn after_first_argument(args: &str) -> &str {
    let mut open_brackets = 0usize;
    let mut open_parens = 0usize;
    let mut active_quote: Option<char> = None;
    let mut cursor = args.char_indices();

    while let Some((pos, c)) = cursor.next() {
        if let Some(quote) = active_quote {
            if c == '\\' {
                // Consume the escaped character as well.
                cursor.next();
            } else if c == quote {
                active_quote = None;
            }
            continue;
        }

        match c {
            '\'' | '"' => active_quote = Some(c),
            '[' | '{' | '<' => open_brackets += 1,
            ']' | '}' | '>' => open_brackets = open_brackets.saturating_sub(1),
            '(' => open_parens += 1,
            ')' => open_parens = open_parens.saturating_sub(1),
            ',' if open_brackets == 0 && open_parens == 0 => {
                return args[pos + 1..].trim();
            }
            _ => {}
        }
    }

    ""
}
1078
1079/// Bug #2: Resolves variable version references like `CSV::VERSION` or `RAILS_VERSION`.
1080///
1081/// Scans the file content for constant definitions matching the variable name
1082/// and returns the resolved string value.
1083fn resolve_variable_version(var_name: &str, contexts: &[String]) -> Option<String> {
1084    let var_name = var_name.trim();
1085    if var_name.is_empty() {
1086        return None;
1087    }
1088
1089    for candidate in candidate_constant_names(var_name) {
1090        let escaped = regex::escape(&candidate);
1091        let pattern = format!(r#"(?m)^\s*{}\s*=\s*["']([^"']+)["']"#, escaped);
1092        let Ok(re) = Regex::new(&pattern) else {
1093            continue;
1094        };
1095
1096        for context in contexts {
1097            if let Some(caps) = re.captures(context) {
1098                return caps.get(1).map(|m| m.as_str().to_string());
1099            }
1100        }
1101    }
1102
1103    None
1104}
1105
1106fn resolve_variable_array(var_name: &str, contexts: &[String]) -> Option<Vec<String>> {
1107    let var_name = var_name.trim();
1108    if var_name.is_empty() {
1109        return None;
1110    }
1111
1112    for candidate in candidate_constant_names(var_name) {
1113        let escaped = regex::escape(&candidate);
1114        let pattern = format!(r#"(?m)^\s*{}\s*=\s*(\[[^\n]+\])"#, escaped);
1115        let Ok(re) = Regex::new(&pattern) else {
1116            continue;
1117        };
1118
1119        for context in contexts {
1120            if let Some(caps) = re.captures(context)
1121                && let Some(raw) = caps.get(1)
1122            {
1123                let values = extract_ruby_array(raw.as_str());
1124                if !values.is_empty() {
1125                    return Some(values);
1126                }
1127            }
1128        }
1129    }
1130
1131    None
1132}
1133
/// Lists the names to look for when resolving a constant reference: the
/// reference as written, followed by its last `::`-separated segment when
/// that differs from the full reference.
fn candidate_constant_names(var_name: &str) -> Vec<String> {
    // Walk forward over every "::" so the remainder is the final segment.
    let mut tail = var_name;
    while let Some((_, rest)) = tail.split_once("::") {
        tail = rest;
    }

    let mut names = Vec::with_capacity(2);
    names.push(var_name.to_owned());
    if tail != var_name {
        names.push(tail.to_owned());
    }
    names
}
1143
1144fn load_required_ruby_contexts(content: &str, base_dir: Option<&Path>) -> Vec<String> {
1145    let mut contexts = vec![content.to_string()];
1146    let Some(base_dir) = base_dir else {
1147        return contexts;
1148    };
1149
1150    let require_re = match Regex::new(r#"(?m)^\s*require(?:_relative)?\s+["']([^"']+)["']"#) {
1151        Ok(re) => re,
1152        Err(_) => return contexts,
1153    };
1154
1155    for caps in require_re.captures_iter(content) {
1156        let Some(required) = caps.get(1).map(|m| m.as_str()) else {
1157            continue;
1158        };
1159        for candidate in candidate_require_paths(base_dir, required) {
1160            if let Ok(required_content) = fs::read_to_string(&candidate) {
1161                contexts.push(required_content);
1162                break;
1163            }
1164        }
1165    }
1166
1167    contexts
1168}
1169
/// Maps a `require` argument to the file paths worth trying.
///
/// `::` is replaced with `/` and `.rb` is appended unless already present;
/// the result is joined onto `base_dir` and onto `base_dir/lib`, in that
/// order.
fn candidate_require_paths(base_dir: &Path, required: &str) -> Vec<PathBuf> {
    let mut filename = required.replace("::", "/");
    if !filename.ends_with(".rb") {
        filename.push_str(".rb");
    }

    [base_dir.to_path_buf(), base_dir.join("lib")]
        .into_iter()
        .map(|root| root.join(&filename))
        .collect()
}
1183
/// Heuristic: a value is treated as a constant reference when it starts with
/// an ASCII uppercase letter or contains `::` anywhere.
fn looks_like_constant_reference(s: &str) -> bool {
    let starts_uppercase = s.starts_with(|c: char| c.is_ascii_uppercase());
    starts_uppercase || s.contains("::")
}
1187
/// Test-only convenience: parses gemspec text without a base directory, so
/// no required Ruby files are loaded for constant resolution.
#[cfg(test)]
fn parse_gemspec(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_gemspec_with_context(content, no_base_dir)
}
1193
/// Parses the text of a `.gemspec` file into `PackageData`.
///
/// Extraction is regex-based and line-oriented:
/// - scalar fields (`name`, `version`, `summary`, `description`, `homepage`,
///   `license`) come from `<var>.<field> = <value>` lines; when a field is
///   assigned more than once the last assignment wins;
/// - `name`, `version`, `summary` and `homepage` values that look like
///   constant references are resolved against `content` and the Ruby files it
///   requires (loaded relative to `base_dir`), falling back to the raw value;
/// - `licenses`, `authors`/`author` and `email` accept array literals;
/// - `add_dependency`, `add_runtime_dependency` and
///   `add_development_dependency` calls become `Dependency` entries.
///
/// If any of the internal regexes fails to compile, a warning is logged and a
/// default gemspec `PackageData` is returned.
fn parse_gemspec_with_context(content: &str, base_dir: Option<&Path>) -> PackageData {
    // The gemspec text itself plus any readable required files; used only for
    // constant resolution below.
    let contexts = load_required_ruby_contexts(content, base_dir);

    // Regex for spec.name = "value" or s.name = "value"
    // The spec variable name varies: spec, s, gem, etc.
    let field_re = match Regex::new(
        r#"(?m)^\s*\w+\.(name|version|summary|description|homepage|license)\s*=\s*(.+)$"#,
    ) {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to compile gemspec field regex: {}", e);
            return default_package_data_with_datasource(DatasourceId::Gemspec);
        }
    };

    // `<var>.licenses = [...]` (plural form, array value).
    let licenses_re = match Regex::new(r#"(?m)^\s*\w+\.licenses\s*=\s*(.+)$"#) {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to compile licenses regex: {}", e);
            return default_package_data_with_datasource(DatasourceId::Gemspec);
        }
    };

    // `<var>.authors = ...` or `<var>.author = ...`.
    let authors_re = match Regex::new(r#"(?m)^\s*\w+\.(?:authors|author)\s*=\s*(.+)$"#) {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to compile authors regex: {}", e);
            return default_package_data_with_datasource(DatasourceId::Gemspec);
        }
    };

    // `<var>.email = ...` (scalar or array).
    let email_re = match Regex::new(r#"(?m)^\s*\w+\.email\s*=\s*(.+)$"#) {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to compile email regex: {}", e);
            return default_package_data_with_datasource(DatasourceId::Gemspec);
        }
    };

    // Dependency declarations, with or without parentheses around the
    // arguments. Group 1 is the method name, group 2 the argument text.
    let dependency_call_re = match Regex::new(
        r#"(?m)^\s*\w+\.(add_(?:development_|runtime_)?dependency)\s*\(?(.+?)\)?\s*$"#,
    ) {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to compile gemspec dependency regex: {}", e);
            return default_package_data_with_datasource(DatasourceId::Gemspec);
        }
    };

    let mut name: Option<String> = None;
    let mut version: Option<String> = None;
    let mut summary: Option<String> = None;
    let mut description: Option<String> = None;
    let mut homepage: Option<String> = None;
    let mut license: Option<String> = None;
    let mut licenses: Vec<String> = Vec::new();
    let mut authors: Vec<String> = Vec::new();
    let mut emails: Vec<String> = Vec::new();
    let mut dependencies: Vec<Dependency> = Vec::new();

    // Extract basic fields
    for caps in field_re.captures_iter(content) {
        let field_name = match caps.get(1) {
            Some(m) => m.as_str(),
            None => continue,
        };
        let raw_value = match caps.get(2) {
            Some(m) => m.as_str().trim(),
            None => continue,
        };

        match field_name {
            "name" => {
                let cleaned = clean_gemspec_value(raw_value);
                // Try constant resolution; keep the raw text if it fails.
                name = if looks_like_constant_reference(&cleaned) {
                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
                } else {
                    Some(cleaned)
                }
            }
            "version" => {
                let cleaned = clean_gemspec_value(raw_value);
                // Bug #2: Check if version is a variable reference
                if looks_like_constant_reference(&cleaned) {
                    version = resolve_variable_version(&cleaned, &contexts).or(Some(cleaned));
                } else {
                    version = Some(cleaned);
                }
            }
            "summary" => {
                let cleaned = clean_gemspec_value(raw_value);
                summary = if looks_like_constant_reference(&cleaned) {
                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
                } else {
                    Some(cleaned)
                }
            }
            // `description` and `license` are taken as written; no constant
            // resolution is attempted for them.
            "description" => description = Some(clean_gemspec_value(raw_value)),
            "homepage" => {
                let cleaned = clean_gemspec_value(raw_value);
                homepage = if looks_like_constant_reference(&cleaned) {
                    resolve_variable_version(&cleaned, &contexts).or(Some(cleaned))
                } else {
                    Some(cleaned)
                }
            }
            "license" => license = Some(clean_gemspec_value(raw_value)),
            _ => {}
        }
    }

    // Extract licenses (plural)
    // Each match replaces the previous list, so the last assignment wins.
    for caps in licenses_re.captures_iter(content) {
        if let Some(raw) = caps.get(1) {
            licenses = extract_ruby_array(raw.as_str());
        }
    }

    // Extract authors
    // An array literal or a resolved constant replaces the list; a plain
    // scalar value is appended to it.
    for caps in authors_re.captures_iter(content) {
        if let Some(raw) = caps.get(1) {
            let raw_str = raw.as_str().trim();
            if raw_str.starts_with('[') {
                authors = extract_ruby_array(raw_str);
            } else if looks_like_constant_reference(raw_str) {
                authors = resolve_variable_array(raw_str, &contexts)
                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
            } else {
                authors.push(clean_gemspec_value(raw_str));
            }
        }
    }

    // Extract emails
    // Same replace-vs-append rules as for authors.
    for caps in email_re.captures_iter(content) {
        if let Some(raw) = caps.get(1) {
            let raw_str = raw.as_str().trim();
            if raw_str.starts_with('[') {
                emails = extract_ruby_array(raw_str);
            } else if looks_like_constant_reference(raw_str) {
                emails = resolve_variable_array(raw_str, &contexts)
                    .unwrap_or_else(|| vec![clean_gemspec_value(raw_str)]);
            } else {
                emails.push(clean_gemspec_value(raw_str));
            }
        }
    }

    // Build parties from authors and emails
    let mut parties: Vec<Party> = Vec::new();

    if authors.len() == 1 && emails.len() == 1 {
        // Exactly one author and one email: merge them into a single party.
        let email_str = emails.first().map(String::as_str);
        let (parsed_email_name, parsed_email) = match email_str {
            Some(e) => split_name_email(e),
            None => (None, None),
        };

        parties.push(Party {
            r#type: Some("person".to_string()),
            role: Some("author".to_string()),
            name: authors.first().cloned().or(parsed_email_name),
            // Fall back to the raw value only when it is a bare address
            // (contains '@' and no '<').
            email: parsed_email.or_else(|| {
                email_str
                    .filter(|e| e.contains('@') && !e.contains('<'))
                    .map(|e| e.to_string())
            }),
            url: None,
            organization: None,
            organization_url: None,
            timezone: None,
        });
    } else {
        // Otherwise authors and emails are not paired: one name-only party
        // per author, then one party per email.
        for author_name in authors {
            parties.push(Party {
                r#type: Some("person".to_string()),
                role: Some("author".to_string()),
                name: Some(author_name),
                email: None,
                url: None,
                organization: None,
                organization_url: None,
                timezone: None,
            });
        }

        for email_str in emails {
            // Only values containing '<' are split into name and address.
            let (parsed_email_name, parsed_email) = if email_str.contains('<') {
                split_name_email(&email_str)
            } else {
                (None, None)
            };
            parties.push(Party {
                r#type: Some("person".to_string()),
                role: Some("author".to_string()),
                name: parsed_email_name,
                email: parsed_email.or_else(|| email_str.contains('@').then_some(email_str)),
                url: None,
                organization: None,
                organization_url: None,
                timezone: None,
            });
        }
    }

    for caps in dependency_call_re.captures_iter(content) {
        let method = match caps.get(1) {
            Some(m) => m.as_str(),
            None => continue,
        };
        let args = match caps.get(2) {
            Some(m) => m.as_str(),
            None => continue,
        };

        // First string literal is the gem name; literals after the first
        // top-level comma are the version requirements.
        let Some(dep_name) = extract_first_ruby_value(args) else {
            continue;
        };
        let version_parts = extract_all_ruby_values(after_first_argument(args));
        let extracted_requirement = if version_parts.is_empty() {
            None
        } else {
            Some(version_parts.join(", "))
        };
        // Dependency PURLs are built without a version.
        let purl = create_gem_purl(&dep_name, None);
        let is_development = method == "add_development_dependency";
        let scope = if is_development {
            "development"
        } else {
            "runtime"
        };

        dependencies.push(Dependency {
            purl,
            extracted_requirement,
            scope: Some(scope.to_string()),
            is_runtime: Some(!is_development),
            is_optional: Some(is_development),
            is_pinned: None,
            is_direct: Some(true),
            resolved_package: None,
            extra_data: None,
        });
    }

    // Extract license statement only - detection happens in separate engine
    // The plural `licenses` list takes precedence over the singular field.
    let extracted_license_statement = if !licenses.is_empty() {
        Some(licenses.join(" AND "))
    } else {
        license
    };

    let declared_license_expression = None;
    let declared_license_expression_spdx = None;

    // Prefer description over summary
    let final_description = description.or(summary);

    // Build PURL
    let purl = name
        .as_deref()
        .map(|n| create_gem_purl(n, version.as_deref()))
        .unwrap_or(None);

    // No platform is passed when building the rubygems.org URLs here.
    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
        if let Some(n) = name.as_deref() {
            get_rubygems_urls(n, version.as_deref(), None)
        } else {
            (None, None, None, None)
        };

    PackageData {
        package_type: Some(PACKAGE_TYPE),
        name,
        version,
        primary_language: Some("Ruby".to_string()),
        description: final_description,
        homepage_url: homepage,
        download_url,
        declared_license_expression,
        declared_license_expression_spdx,
        extracted_license_statement,
        parties,
        dependencies,
        repository_homepage_url,
        repository_download_url,
        api_data_url,
        datasource_id: Some(DatasourceId::Gemspec),
        purl,
        ..default_package_data()
    }
}
1486
1487// =============================================================================
1488// .gem Archive Parser (Wave 3)
1489// =============================================================================
1490
/// Largest `.gem` file, in bytes, that `extract_gem_archive` accepts.
const MAX_ARCHIVE_SIZE: u64 = 100 * 1024 * 1024; // 100MB
/// Byte limit applied to `metadata.gz`, both as stored in the archive and
/// after decompression.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB per file
/// Highest accepted ratio of decompressed size to stored size for
/// `metadata.gz`.
const MAX_COMPRESSION_RATIO: f64 = 100.0; // 100:1 ratio
1494
1495/// Parser for .gem archive files.
1496///
1497/// Extracts metadata from Ruby .gem packages, which are tar archives
1498/// containing a gzip-compressed YAML metadata file (`metadata.gz`).
1499///
1500/// Includes safety checks against zip bombs and oversized archives.
1501pub struct GemArchiveParser;
1502
1503impl PackageParser for GemArchiveParser {
1504    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1505
1506    fn extract_packages(path: &Path) -> Vec<PackageData> {
1507        vec![match extract_gem_archive(path) {
1508            Ok(data) => data,
1509            Err(e) => {
1510                warn!("Failed to extract .gem archive at {:?}: {}", path, e);
1511                default_package_data_with_datasource(DatasourceId::GemArchive)
1512            }
1513        }]
1514    }
1515
1516    fn is_match(path: &Path) -> bool {
1517        path.extension()
1518            .and_then(|ext| ext.to_str())
1519            .is_some_and(|ext| ext == "gem")
1520    }
1521}
1522
1523fn extract_gem_archive(path: &Path) -> Result<PackageData, String> {
1524    let file_metadata =
1525        fs::metadata(path).map_err(|e| format!("Failed to read file metadata: {}", e))?;
1526    let archive_size = file_metadata.len();
1527
1528    if archive_size > MAX_ARCHIVE_SIZE {
1529        return Err(format!(
1530            "Archive too large: {} bytes (limit: {} bytes)",
1531            archive_size, MAX_ARCHIVE_SIZE
1532        ));
1533    }
1534
1535    let file = File::open(path).map_err(|e| format!("Failed to open archive: {}", e))?;
1536    let mut archive = Archive::new(file);
1537
1538    for entry_result in archive
1539        .entries()
1540        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1541    {
1542        let entry = entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1543        let entry_path = entry
1544            .path()
1545            .map_err(|e| format!("Failed to get entry path: {}", e))?;
1546
1547        if entry_path.to_str() == Some("metadata.gz") {
1548            let entry_size = entry.size();
1549            if entry_size > MAX_FILE_SIZE {
1550                return Err(format!(
1551                    "metadata.gz too large: {} bytes (limit: {} bytes)",
1552                    entry_size, MAX_FILE_SIZE
1553                ));
1554            }
1555
1556            let mut decoder = GzDecoder::new(entry);
1557            let mut content = String::new();
1558            decoder
1559                .read_to_string(&mut content)
1560                .map_err(|e| format!("Failed to decompress metadata.gz: {}", e))?;
1561
1562            let uncompressed_size = content.len() as u64;
1563            if entry_size > 0 {
1564                let ratio = uncompressed_size as f64 / entry_size as f64;
1565                if ratio > MAX_COMPRESSION_RATIO {
1566                    return Err(format!(
1567                        "Suspicious compression ratio: {:.2}:1 (limit: {:.0}:1)",
1568                        ratio, MAX_COMPRESSION_RATIO
1569                    ));
1570                }
1571            }
1572            if uncompressed_size > MAX_FILE_SIZE {
1573                return Err(format!(
1574                    "Decompressed metadata too large: {} bytes (limit: {} bytes)",
1575                    uncompressed_size, MAX_FILE_SIZE
1576                ));
1577            }
1578
1579            return parse_gem_metadata_yaml(&content, DatasourceId::GemArchive);
1580        }
1581    }
1582
1583    Err("metadata.gz not found in .gem archive".to_string())
1584}
1585
1586fn parse_gem_metadata_yaml(
1587    content: &str,
1588    datasource_id: DatasourceId,
1589) -> Result<PackageData, String> {
1590    // Ruby YAML tagged types need to be handled:
1591    // --- !ruby/object:Gem::Specification
1592    // We strip Ruby-specific YAML tags since serde_yaml can't handle them
1593    let cleaned = clean_ruby_yaml_tags(content);
1594
1595    let yaml: serde_yaml::Value =
1596        serde_yaml::from_str(&cleaned).map_err(|e| format!("Failed to parse YAML: {}", e))?;
1597
1598    let name = yaml_string(&yaml, "name");
1599    let version = yaml.get("version").and_then(|v| {
1600        // version can be a simple string or a mapping with a "version" key
1601        if v.is_string() {
1602            v.as_str().map(|s| s.to_string())
1603        } else {
1604            yaml_string(v, "version")
1605        }
1606    });
1607    let description = yaml_string(&yaml, "description").or_else(|| yaml_string(&yaml, "summary"));
1608    let homepage = yaml_string(&yaml, "homepage");
1609    let summary = yaml_string(&yaml, "summary");
1610
1611    // Licenses
1612    let licenses: Vec<String> = yaml
1613        .get("licenses")
1614        .and_then(|v| v.as_sequence())
1615        .map(|seq| {
1616            seq.iter()
1617                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1618                .collect()
1619        })
1620        .unwrap_or_default();
1621
1622    // Extract license statement only - detection happens in separate engine
1623    let extracted_license_statement = if !licenses.is_empty() {
1624        Some(licenses.join(" AND "))
1625    } else {
1626        None
1627    };
1628
1629    let license_expression = None;
1630    let license_expression_spdx = None;
1631
1632    // Authors
1633    let authors: Vec<String> = yaml
1634        .get("authors")
1635        .and_then(|v| v.as_sequence())
1636        .map(|seq| {
1637            seq.iter()
1638                .filter_map(|item| item.as_str().map(|s| s.to_string()))
1639                .collect()
1640        })
1641        .unwrap_or_default();
1642
1643    let emails: Vec<String> = yaml
1644        .get("email")
1645        .map(|v| {
1646            if let Some(seq) = v.as_sequence() {
1647                seq.iter()
1648                    .filter_map(|item| item.as_str().map(|s| s.to_string()))
1649                    .collect()
1650            } else if let Some(s) = v.as_str() {
1651                vec![s.to_string()]
1652            } else {
1653                Vec::new()
1654            }
1655        })
1656        .unwrap_or_default();
1657
1658    // Build parties
1659    let mut parties: Vec<Party> = Vec::new();
1660    let max_len = authors.len().max(emails.len());
1661    for i in 0..max_len {
1662        let author_name = authors.get(i).map(|s| s.as_str());
1663        let email_str = emails.get(i).map(|s| s.as_str());
1664
1665        let (parsed_email_name, parsed_email) = match email_str {
1666            Some(e) if e.contains('<') => split_name_email(e),
1667            None => (None, None),
1668            _ => (None, None),
1669        };
1670
1671        let party_name = author_name.map(|s| s.to_string()).or(parsed_email_name);
1672
1673        parties.push(Party {
1674            r#type: Some("person".to_string()),
1675            role: Some("author".to_string()),
1676            name: party_name,
1677            email: parsed_email.or_else(|| {
1678                email_str
1679                    .filter(|e| e.contains('@') && !e.contains('<'))
1680                    .map(|e| e.to_string())
1681            }),
1682            url: None,
1683            organization: None,
1684            organization_url: None,
1685            timezone: None,
1686        });
1687    }
1688
1689    // Dependencies
1690    let dependencies = parse_gem_yaml_dependencies(&yaml);
1691
1692    let metadata = yaml.get("metadata");
1693
1694    let bug_tracking_url = metadata.and_then(|m| yaml_string(m, "bug_tracking_uri"));
1695
1696    let code_view_url = metadata.and_then(|m| yaml_string(m, "source_code_uri"));
1697
1698    let vcs_url = code_view_url
1699        .clone()
1700        .or_else(|| metadata.and_then(|m| yaml_string(m, "homepage_uri")));
1701
1702    let file_references = metadata
1703        .and_then(|m| m.get("files"))
1704        .and_then(|f| f.as_sequence())
1705        .map(|seq| {
1706            seq.iter()
1707                .filter_map(|v| v.as_str())
1708                .map(|s| crate::models::FileReference {
1709                    path: s.to_string(),
1710                    size: None,
1711                    sha1: None,
1712                    md5: None,
1713                    sha256: None,
1714                    sha512: None,
1715                    extra_data: None,
1716                })
1717                .collect::<Vec<_>>()
1718        })
1719        .unwrap_or_default();
1720
1721    let release_date = yaml_string(&yaml, "date").and_then(|d| {
1722        if d.len() >= 10 {
1723            Some(d[..10].to_string())
1724        } else {
1725            None
1726        }
1727    });
1728
1729    let purl = name
1730        .as_deref()
1731        .map(|n| create_gem_purl(n, version.as_deref()))
1732        .unwrap_or(None);
1733
1734    let platform = yaml_string(&yaml, "platform");
1735    let (repository_homepage_url, repository_download_url, api_data_url, download_url) =
1736        if let Some(n) = name.as_deref() {
1737            get_rubygems_urls(n, version.as_deref(), platform.as_deref())
1738        } else {
1739            (None, None, None, None)
1740        };
1741
1742    let qualifiers = if let Some(ref p) = platform {
1743        if p != "ruby" {
1744            let mut q = HashMap::new();
1745            q.insert("platform".to_string(), p.clone());
1746            Some(q)
1747        } else {
1748            None
1749        }
1750    } else {
1751        None
1752    };
1753
1754    Ok(PackageData {
1755        package_type: Some(PACKAGE_TYPE),
1756        name,
1757        version,
1758        qualifiers,
1759        primary_language: Some("Ruby".to_string()),
1760        description: description.or(summary),
1761        release_date,
1762        homepage_url: homepage,
1763        download_url,
1764        bug_tracking_url,
1765        code_view_url,
1766        declared_license_expression: license_expression,
1767        declared_license_expression_spdx: license_expression_spdx,
1768        extracted_license_statement,
1769        file_references,
1770        parties,
1771        dependencies,
1772        repository_homepage_url,
1773        repository_download_url,
1774        api_data_url,
1775        datasource_id: Some(datasource_id),
1776        purl,
1777        vcs_url,
1778        ..default_package_data()
1779    })
1780}
1781
1782/// Strips Ruby-specific YAML tags that serde_yaml cannot handle.
1783fn clean_ruby_yaml_tags(content: &str) -> String {
1784    let tag_re = match Regex::new(r"!ruby/\S+") {
1785        Ok(r) => r,
1786        Err(_) => return content.to_string(),
1787    };
1788    tag_re.replace_all(content, "").to_string()
1789}
1790
1791fn yaml_string(yaml: &serde_yaml::Value, key: &str) -> Option<String> {
1792    yaml.get(key)
1793        .and_then(|v| v.as_str())
1794        .filter(|s| !s.is_empty())
1795        .map(|s| s.to_string())
1796}
1797
1798fn parse_gem_yaml_dependencies(yaml: &serde_yaml::Value) -> Vec<Dependency> {
1799    let mut dependencies = Vec::new();
1800
1801    let deps_seq = match yaml.get("dependencies").and_then(|v| v.as_sequence()) {
1802        Some(seq) => seq,
1803        None => return dependencies,
1804    };
1805
1806    for dep_value in deps_seq {
1807        let dep_name = match yaml_string(dep_value, "name") {
1808            Some(n) => n,
1809            None => continue,
1810        };
1811
1812        let dep_type = yaml_string(dep_value, "type");
1813        let is_development = dep_type.as_deref() == Some(":development");
1814
1815        // Extract version requirements from the nested structure
1816        let requirements = dep_value
1817            .get("requirement")
1818            .or_else(|| dep_value.get("version_requirements"))
1819            .and_then(|req| req.get("requirements"))
1820            .and_then(|reqs| reqs.as_sequence());
1821
1822        let extracted_requirement = requirements.map(|reqs| {
1823            let parts: Vec<String> = reqs
1824                .iter()
1825                .filter_map(|req| {
1826                    let seq = req.as_sequence()?;
1827                    if seq.len() >= 2 {
1828                        let op = seq[0].as_str().unwrap_or("");
1829                        let ver = seq[1].get("version").and_then(|v| v.as_str()).unwrap_or("");
1830                        if op == ">=" && ver == "0" {
1831                            // ">= 0" means "any version" - skip
1832                            None
1833                        } else if op.is_empty() || ver.is_empty() {
1834                            None
1835                        } else {
1836                            Some(format!("{} {}", op, ver))
1837                        }
1838                    } else {
1839                        None
1840                    }
1841                })
1842                .collect();
1843            parts.join(", ")
1844        });
1845
1846        let extracted_requirement = extracted_requirement
1847            .filter(|s| !s.is_empty())
1848            .or_else(|| Some(String::new()));
1849
1850        let (scope, is_runtime, is_optional) = if is_development {
1851            (Some("development".to_string()), false, true)
1852        } else {
1853            (Some("runtime".to_string()), true, false)
1854        };
1855
1856        let purl = create_gem_purl(&dep_name, None);
1857
1858        dependencies.push(Dependency {
1859            purl,
1860            extracted_requirement,
1861            scope,
1862            is_runtime: Some(is_runtime),
1863            is_optional: Some(is_optional),
1864            is_pinned: None,
1865            is_direct: Some(true),
1866            resolved_package: None,
1867            extra_data: None,
1868        });
1869    }
1870
1871    dependencies
1872}
1873
1874// =============================================================================
1875// Gem Metadata Extracted Parser (metadata.gz-extract files)
1876// =============================================================================
1877
1878pub struct GemMetadataExtractedParser;
1879
1880impl PackageParser for GemMetadataExtractedParser {
1881    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1882
1883    fn extract_packages(path: &Path) -> Vec<PackageData> {
1884        vec![match extract_gem_metadata_extracted(path) {
1885            Ok(data) => data,
1886            Err(e) => {
1887                warn!("Failed to extract gem metadata from {:?}: {}", path, e);
1888                default_package_data_with_datasource(DatasourceId::GemArchiveExtracted)
1889            }
1890        }]
1891    }
1892
1893    fn is_match(path: &Path) -> bool {
1894        path.to_str()
1895            .is_some_and(|p| p.contains("metadata.gz-extract"))
1896    }
1897}
1898
1899fn extract_gem_metadata_extracted(path: &Path) -> Result<PackageData, String> {
1900    let content = fs::read_to_string(path)
1901        .map_err(|e| format!("Failed to read metadata.gz-extract file: {}", e))?;
1902
1903    parse_gem_metadata_yaml(&content, DatasourceId::GemArchiveExtracted)
1904}
1905
// Parser registrations for the Ruby/RubyGems formats handled in this module.
// NOTE(review): the arguments appear to be, in order: a human-readable
// description, the path globs, the package type, the primary language, and an
// optional documentation URL — confirm against the `register_parser!`
// definition.

// Gemfile manifests, including a Gemfile inside a `data.gz-extract` directory.
crate::register_parser!(
    "Ruby Gemfile manifest",
    &["**/Gemfile", "**/data.gz-extract/Gemfile"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// Gemfile.lock lockfiles, including one inside a `data.gz-extract` directory.
crate::register_parser!(
    "Ruby Gemfile.lock lockfile",
    &["**/Gemfile.lock", "**/data.gz-extract/Gemfile.lock"],
    "gem",
    "Ruby",
    Some("https://bundler.io/man/gemfile.5.html"),
);

// .gemspec files anywhere, plus the explicit `data.gz-extract` and
// `specifications` directory locations.
crate::register_parser!(
    "Ruby .gemspec manifest",
    &[
        "**/*.gemspec",
        "**/data.gz-extract/*.gemspec",
        "**/specifications/*.gemspec"
    ],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Packaged .gem archives.
crate::register_parser!(
    "Ruby .gem archive",
    &["**/*.gem"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);

// Gem metadata files named `metadata.gz-extract`.
crate::register_parser!(
    "Ruby gem metadata (extracted)",
    &["**/metadata.gz-extract"],
    "gem",
    "Ruby",
    Some("https://guides.rubygems.org/specification-reference/"),
);
1950
#[cfg(test)]
mod tests {
    use super::{clean_gemspec_value, parse_gemspec};

    /// A `%q{` literal without its closing brace still yields the inner text.
    #[test]
    fn test_clean_gemspec_value_handles_unterminated_percent_q() {
        let raw = "%q{Arel is a SQL AST manager for Ruby. It";
        let cleaned = clean_gemspec_value(raw);
        assert_eq!(cleaned, "Arel is a SQL AST manager for Ruby. It");
    }

    /// Both `add_runtime_dependency` and `add_dependency` produce
    /// runtime-scoped dependencies that keep their requirement string.
    #[test]
    fn test_parse_gemspec_runtime_dependency_scope() {
        let content = r#"
Gem::Specification.new do |spec|
  spec.name = "demo"
  spec.version = "1.0.0"
  spec.add_runtime_dependency "rack", "~> 3.0"
  spec.add_dependency "thor", ">= 1.0"
end
"#;

        // (scope, extracted requirement) in declaration order.
        let expected = [("runtime", "~> 3.0"), ("runtime", ">= 1.0")];

        let package_data = parse_gemspec(content);
        assert_eq!(package_data.dependencies.len(), 2);

        for (dep, &(scope, requirement)) in package_data.dependencies.iter().zip(expected.iter()) {
            assert_eq!(dep.scope, Some(scope.to_string()));
            assert_eq!(dep.extracted_requirement, Some(requirement.to_string()));
        }
    }
}