Skip to main content

provenant/parsers/
debian.rs

1//! Parser for Debian package metadata files.
2//!
3//! Extracts package metadata from Debian package management files using RFC 822
4//! format parsing for control files and installed package databases.
5//!
6//! # Supported Formats
7//! - `debian/control` (Source package control files - multi-paragraph)
8//! - `/var/lib/dpkg/status` (Installed package database - multi-paragraph)
9//! - `/var/lib/dpkg/status.d/*` (Distroless installed packages)
10//! - `*.dsc` (Debian source control files)
11//! - `*.orig.tar.*` (Original upstream tarballs)
12//! - `*.debian.tar.*` (Debian packaging tarballs)
13//! - `/var/lib/dpkg/info/*.list` (Installed file lists)
14//! - `/var/lib/dpkg/info/*.md5sums` (Installed file checksums)
15//! - `debian/copyright` (Copyright/license declarations)
16//! - `*.deb` (Debian binary package archives)
17//! - `control` (extracted from .deb archives)
18//! - `md5sums` (extracted from .deb archives)
19//!
20//! # Key Features
21//! - RFC 822 format parsing for control files
22//! - Dependency extraction with scope tracking (Depends, Build-Depends, etc.)
23//! - Debian vs Ubuntu namespace detection from version and maintainer fields
24//! - Multi-paragraph record parsing for package databases
25//! - License and copyright information extraction
26//! - Package URL (purl) generation with namespace
27//!
28//! # Implementation Notes
29//! - Uses RFC 822 parser from `crate::parsers::rfc822` module
30//! - Multi-paragraph records separated by blank lines
31//! - Graceful error handling with `warn!()` logs
32
33use std::collections::HashMap;
34use std::path::Path;
35
36use log::warn;
37use packageurl::PackageUrl;
38use regex::Regex;
39
40use crate::models::{
41    DatasourceId, Dependency, FileReference, LicenseDetection, PackageData, PackageType, Party,
42};
43use crate::parsers::rfc822::{self, Rfc822Metadata};
44use crate::parsers::utils::{read_file_to_string, split_name_email};
45use crate::utils::spdx::combine_license_expressions;
46
47use super::PackageParser;
48use super::license_normalization::{
49    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_detection,
50    normalize_declared_license_key,
51};
52
/// Package type shared by every parser in this module; also used as the type
/// component when building Debian package URLs.
const PACKAGE_TYPE: PackageType = PackageType::Deb;
54
55fn default_package_data(datasource_id: DatasourceId) -> PackageData {
56    PackageData {
57        package_type: Some(PACKAGE_TYPE),
58        datasource_id: Some(datasource_id),
59        ..Default::default()
60    }
61}
62
// Namespace detection clues from version strings.
// `detect_namespace` lowercases the version and does a substring search, so
// every clue here must be lowercase. Ubuntu clues are checked before Debian
// ones.
const VERSION_CLUES_DEBIAN: &[&str] = &["deb"];
const VERSION_CLUES_UBUNTU: &[&str] = &["ubuntu"];

// Namespace detection clues from maintainer fields.
// Matched as lowercase substrings of the raw `Maintainer` value; only
// consulted when the version gave no answer.
const MAINTAINER_CLUES_DEBIAN: &[&str] = &[
    "packages.debian.org",
    "lists.debian.org",
    "lists.alioth.debian.org",
    "@debian.org",
    "debian-init-diversity@",
];
const MAINTAINER_CLUES_UBUNTU: &[&str] = &["lists.ubuntu.com", "@canonical.com"];
76
// Dependency field names and their scope/flags
/// Describes one relationship field of a control paragraph and how the
/// dependencies parsed from it are flagged.
struct DepFieldSpec {
    /// Header name used to look the field up in the parsed paragraph.
    field: &'static str,
    /// Value stored in `Dependency::scope` for entries from this field.
    scope: &'static str,
    /// Value stored in `Dependency::is_runtime`.
    is_runtime: bool,
    /// Baseline for `Dependency::is_optional`; entries that are part of an
    /// alternative group (`a | b`) are marked optional regardless.
    is_optional: bool,
}
84
85const DEP_FIELDS: &[DepFieldSpec] = &[
86    DepFieldSpec {
87        field: "depends",
88        scope: "depends",
89        is_runtime: true,
90        is_optional: false,
91    },
92    DepFieldSpec {
93        field: "pre-depends",
94        scope: "pre-depends",
95        is_runtime: true,
96        is_optional: false,
97    },
98    DepFieldSpec {
99        field: "recommends",
100        scope: "recommends",
101        is_runtime: true,
102        is_optional: true,
103    },
104    DepFieldSpec {
105        field: "suggests",
106        scope: "suggests",
107        is_runtime: true,
108        is_optional: true,
109    },
110    DepFieldSpec {
111        field: "breaks",
112        scope: "breaks",
113        is_runtime: false,
114        is_optional: false,
115    },
116    DepFieldSpec {
117        field: "conflicts",
118        scope: "conflicts",
119        is_runtime: false,
120        is_optional: false,
121    },
122    DepFieldSpec {
123        field: "replaces",
124        scope: "replaces",
125        is_runtime: false,
126        is_optional: false,
127    },
128    DepFieldSpec {
129        field: "provides",
130        scope: "provides",
131        is_runtime: false,
132        is_optional: false,
133    },
134    DepFieldSpec {
135        field: "build-depends",
136        scope: "build-depends",
137        is_runtime: false,
138        is_optional: false,
139    },
140    DepFieldSpec {
141        field: "build-depends-indep",
142        scope: "build-depends-indep",
143        is_runtime: false,
144        is_optional: false,
145    },
146    DepFieldSpec {
147        field: "build-conflicts",
148        scope: "build-conflicts",
149        is_runtime: false,
150        is_optional: false,
151    },
152];
153
154// ---------------------------------------------------------------------------
155// DebianControlParser: debian/control files (source + binary paragraphs)
156// ---------------------------------------------------------------------------
157
158pub struct DebianControlParser;
159
160impl PackageParser for DebianControlParser {
161    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
162
163    fn is_match(path: &Path) -> bool {
164        if let Some(name) = path.file_name()
165            && name == "control"
166            && let Some(parent) = path.parent()
167            && let Some(parent_name) = parent.file_name()
168        {
169            return parent_name == "debian";
170        }
171        false
172    }
173
174    fn extract_packages(path: &Path) -> Vec<PackageData> {
175        let content = match read_file_to_string(path) {
176            Ok(c) => c,
177            Err(e) => {
178                warn!("Failed to read debian/control at {:?}: {}", path, e);
179                return Vec::new();
180            }
181        };
182
183        parse_debian_control(&content)
184    }
185}
186
187// ---------------------------------------------------------------------------
188// DebianInstalledParser: /var/lib/dpkg/status
189// ---------------------------------------------------------------------------
190
191pub struct DebianInstalledParser;
192
193impl PackageParser for DebianInstalledParser {
194    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
195
196    fn is_match(path: &Path) -> bool {
197        let path_str = path.to_string_lossy();
198        path_str.ends_with("var/lib/dpkg/status")
199    }
200
201    fn extract_packages(path: &Path) -> Vec<PackageData> {
202        let content = match read_file_to_string(path) {
203            Ok(c) => c,
204            Err(e) => {
205                warn!("Failed to read dpkg/status at {:?}: {}", path, e);
206                return Vec::new();
207            }
208        };
209
210        parse_dpkg_status(&content)
211    }
212}
213
214pub struct DebianDistrolessInstalledParser;
215
216impl PackageParser for DebianDistrolessInstalledParser {
217    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
218
219    fn is_match(path: &Path) -> bool {
220        let path_str = path.to_string_lossy();
221        path_str.contains("var/lib/dpkg/status.d/")
222    }
223
224    fn extract_packages(path: &Path) -> Vec<PackageData> {
225        let content = match read_file_to_string(path) {
226            Ok(c) => c,
227            Err(e) => {
228                warn!("Failed to read distroless status file at {:?}: {}", path, e);
229                return vec![default_package_data(
230                    DatasourceId::DebianDistrolessInstalledDb,
231                )];
232            }
233        };
234
235        vec![parse_distroless_status(&content)]
236    }
237}
238
239fn parse_distroless_status(content: &str) -> PackageData {
240    let paragraphs = rfc822::parse_rfc822_paragraphs(content);
241
242    if paragraphs.is_empty() {
243        return default_package_data(DatasourceId::DebianDistrolessInstalledDb);
244    }
245
246    build_package_from_paragraph(
247        &paragraphs[0],
248        None,
249        DatasourceId::DebianDistrolessInstalledDb,
250    )
251    .unwrap_or_else(|| default_package_data(DatasourceId::DebianDistrolessInstalledDb))
252}
253
254// ---------------------------------------------------------------------------
255// Parsing logic
256// ---------------------------------------------------------------------------
257
258/// Parses a debian/control file into PackageData entries.
259///
260/// A debian/control file has a Source paragraph followed by one or more Binary
261/// paragraphs. Source-level metadata (maintainer, homepage, VCS URLs) is merged
262/// into each binary package.
263fn parse_debian_control(content: &str) -> Vec<PackageData> {
264    let paragraphs = rfc822::parse_rfc822_paragraphs(content);
265    if paragraphs.is_empty() {
266        return Vec::new();
267    }
268
269    // Determine if first paragraph is a Source paragraph
270    let has_source = rfc822::get_header_first(&paragraphs[0].headers, "source").is_some();
271
272    let (source_paragraph, binary_start) = if has_source {
273        (Some(&paragraphs[0]), 1)
274    } else {
275        (None, 0)
276    };
277
278    // Extract source-level shared metadata
279    let source_meta = source_paragraph.map(extract_source_meta);
280
281    let mut packages = Vec::new();
282
283    for para in &paragraphs[binary_start..] {
284        if let Some(pkg) = build_package_from_paragraph(
285            para,
286            source_meta.as_ref(),
287            DatasourceId::DebianControlInSource,
288        ) {
289            packages.push(pkg);
290        }
291    }
292
293    if packages.is_empty()
294        && let Some(source_para) = source_paragraph
295        && let Some(pkg) = build_package_from_source_paragraph(source_para)
296    {
297        packages.push(pkg);
298    }
299
300    packages
301}
302
303/// Parses a dpkg/status file into PackageData entries.
304///
305/// Each paragraph represents an installed package. Only packages with
306/// `Status: install ok installed` are included.
307fn parse_dpkg_status(content: &str) -> Vec<PackageData> {
308    let paragraphs = rfc822::parse_rfc822_paragraphs(content);
309    let mut packages = Vec::new();
310
311    for para in &paragraphs {
312        let status = rfc822::get_header_first(&para.headers, "status");
313        if status.as_deref() != Some("install ok installed") {
314            continue;
315        }
316
317        if let Some(pkg) =
318            build_package_from_paragraph(para, None, DatasourceId::DebianInstalledStatusDb)
319        {
320            packages.push(pkg);
321        }
322    }
323
324    packages
325}
326
327// ---------------------------------------------------------------------------
328// Source paragraph metadata (shared across binary packages)
329// ---------------------------------------------------------------------------
330
/// Metadata read from the source paragraph of a `debian/control` file and
/// shared with the binary packages built from the same file.
struct SourceMeta {
    /// Maintainer, original maintainer and uploaders, in that order.
    parties: Vec<Party>,
    /// Value of the `Homepage` header.
    homepage_url: Option<String>,
    /// First whitespace-separated token of the `Vcs-Git` header.
    vcs_url: Option<String>,
    /// Value of the `Vcs-Browser` header.
    code_view_url: Option<String>,
    /// Value of the `Bugs` header.
    bug_tracking_url: Option<String>,
}
338
339fn extract_source_meta(paragraph: &Rfc822Metadata) -> SourceMeta {
340    let mut parties = Vec::new();
341
342    // Maintainer
343    if let Some(maintainer) = rfc822::get_header_first(&paragraph.headers, "maintainer") {
344        let (name, email) = split_name_email(&maintainer);
345        parties.push(Party {
346            r#type: Some("person".to_string()),
347            role: Some("maintainer".to_string()),
348            name,
349            email,
350            url: None,
351            organization: None,
352            organization_url: None,
353            timezone: None,
354        });
355    }
356
357    // Original-Maintainer
358    if let Some(orig_maintainer) =
359        rfc822::get_header_first(&paragraph.headers, "original-maintainer")
360    {
361        let (name, email) = split_name_email(&orig_maintainer);
362        parties.push(Party {
363            r#type: Some("person".to_string()),
364            role: Some("maintainer".to_string()),
365            name,
366            email,
367            url: None,
368            organization: None,
369            organization_url: None,
370            timezone: None,
371        });
372    }
373
374    // Uploaders (comma-separated)
375    if let Some(uploaders_str) = rfc822::get_header_first(&paragraph.headers, "uploaders") {
376        for uploader in uploaders_str.split(',') {
377            let trimmed = uploader.trim();
378            if !trimmed.is_empty() {
379                let (name, email) = split_name_email(trimmed);
380                parties.push(Party {
381                    r#type: Some("person".to_string()),
382                    role: Some("uploader".to_string()),
383                    name,
384                    email,
385                    url: None,
386                    organization: None,
387                    organization_url: None,
388                    timezone: None,
389                });
390            }
391        }
392    }
393
394    let homepage_url = rfc822::get_header_first(&paragraph.headers, "homepage");
395
396    // VCS-Git: may contain branch info after space
397    let vcs_url = rfc822::get_header_first(&paragraph.headers, "vcs-git")
398        .map(|url| url.split_whitespace().next().unwrap_or(&url).to_string());
399
400    let code_view_url = rfc822::get_header_first(&paragraph.headers, "vcs-browser");
401
402    let bug_tracking_url = rfc822::get_header_first(&paragraph.headers, "bugs");
403
404    SourceMeta {
405        parties,
406        homepage_url,
407        vcs_url,
408        code_view_url,
409        bug_tracking_url,
410    }
411}
412
413// ---------------------------------------------------------------------------
414// Package building
415// ---------------------------------------------------------------------------
416
417fn build_package_from_paragraph(
418    paragraph: &Rfc822Metadata,
419    source_meta: Option<&SourceMeta>,
420    datasource_id: DatasourceId,
421) -> Option<PackageData> {
422    let name = rfc822::get_header_first(&paragraph.headers, "package")?;
423    let version = rfc822::get_header_first(&paragraph.headers, "version");
424    let architecture = rfc822::get_header_first(&paragraph.headers, "architecture");
425    let description = rfc822::get_header_first(&paragraph.headers, "description");
426    let maintainer_str = rfc822::get_header_first(&paragraph.headers, "maintainer");
427    let homepage = rfc822::get_header_first(&paragraph.headers, "homepage");
428    let source_field = rfc822::get_header_first(&paragraph.headers, "source");
429    let section = rfc822::get_header_first(&paragraph.headers, "section");
430    let installed_size = rfc822::get_header_first(&paragraph.headers, "installed-size");
431    let multi_arch = rfc822::get_header_first(&paragraph.headers, "multi-arch");
432
433    let namespace = detect_namespace(version.as_deref(), maintainer_str.as_deref());
434
435    // Build parties: use source_meta parties if available, otherwise parse from paragraph
436    let parties = if let Some(meta) = source_meta {
437        meta.parties.clone()
438    } else {
439        let mut p = Vec::new();
440        if let Some(m) = &maintainer_str {
441            let (n, e) = split_name_email(m);
442            p.push(Party {
443                r#type: Some("person".to_string()),
444                role: Some("maintainer".to_string()),
445                name: n,
446                email: e,
447                url: None,
448                organization: None,
449                organization_url: None,
450                timezone: None,
451            });
452        }
453        p
454    };
455
456    // Resolve homepage: paragraph's own, or from source metadata
457    let homepage_url = homepage.or_else(|| source_meta.and_then(|m| m.homepage_url.clone()));
458    let vcs_url = source_meta.and_then(|m| m.vcs_url.clone());
459    let code_view_url = source_meta.and_then(|m| m.code_view_url.clone());
460    let bug_tracking_url = source_meta.and_then(|m| m.bug_tracking_url.clone());
461
462    // Build PURL
463    let purl = build_debian_purl(
464        &name,
465        version.as_deref(),
466        namespace.as_deref(),
467        architecture.as_deref(),
468    );
469
470    // Parse dependencies from all dependency fields
471    let dependencies = parse_all_dependencies(&paragraph.headers, namespace.as_deref());
472
473    // Keywords from section
474    let keywords = section.into_iter().collect();
475
476    // Source packages
477    let source_packages = parse_source_field(source_field.as_deref(), namespace.as_deref());
478
479    // Extra data
480    let mut extra_data: HashMap<String, serde_json::Value> = HashMap::new();
481    if let Some(ma) = &multi_arch
482        && !ma.is_empty()
483    {
484        extra_data.insert(
485            "multi_arch".to_string(),
486            serde_json::Value::String(ma.clone()),
487        );
488    }
489    if let Some(size_str) = &installed_size
490        && let Ok(size) = size_str.parse::<u64>()
491    {
492        extra_data.insert(
493            "installed_size".to_string(),
494            serde_json::Value::Number(serde_json::Number::from(size)),
495        );
496    }
497
498    // Qualifiers for architecture
499    let qualifiers = architecture.as_ref().map(|arch| {
500        let mut q = HashMap::new();
501        q.insert("arch".to_string(), arch.clone());
502        q
503    });
504
505    Some(PackageData {
506        package_type: Some(PACKAGE_TYPE),
507        namespace: namespace.clone(),
508        name: Some(name),
509        version,
510        qualifiers,
511        subpath: None,
512        primary_language: None,
513        description,
514        release_date: None,
515        parties,
516        keywords,
517        homepage_url,
518        download_url: None,
519        size: None,
520        sha1: None,
521        md5: None,
522        sha256: None,
523        sha512: None,
524        bug_tracking_url,
525        code_view_url,
526        vcs_url,
527        copyright: None,
528        holder: None,
529        declared_license_expression: None,
530        declared_license_expression_spdx: None,
531        license_detections: Vec::new(),
532        other_license_expression: None,
533        other_license_expression_spdx: None,
534        other_license_detections: Vec::new(),
535        extracted_license_statement: None,
536        notice_text: None,
537        source_packages,
538        file_references: Vec::new(),
539        is_private: false,
540        is_virtual: false,
541        extra_data: if extra_data.is_empty() {
542            None
543        } else {
544            Some(extra_data)
545        },
546        dependencies,
547        repository_homepage_url: None,
548        repository_download_url: None,
549        api_data_url: None,
550        datasource_id: Some(datasource_id),
551        purl,
552    })
553}
554
555fn build_package_from_source_paragraph(paragraph: &Rfc822Metadata) -> Option<PackageData> {
556    let name = rfc822::get_header_first(&paragraph.headers, "source")?;
557    let version = rfc822::get_header_first(&paragraph.headers, "version");
558    let maintainer_str = rfc822::get_header_first(&paragraph.headers, "maintainer");
559
560    let namespace = detect_namespace(version.as_deref(), maintainer_str.as_deref());
561    let source_meta = extract_source_meta(paragraph);
562
563    let purl = build_debian_purl(&name, version.as_deref(), namespace.as_deref(), None);
564    let dependencies = parse_all_dependencies(&paragraph.headers, namespace.as_deref());
565
566    let section = rfc822::get_header_first(&paragraph.headers, "section");
567    let keywords = section.into_iter().collect();
568
569    Some(PackageData {
570        package_type: Some(PACKAGE_TYPE),
571        namespace: namespace.clone(),
572        name: Some(name),
573        version,
574        qualifiers: None,
575        subpath: None,
576        primary_language: None,
577        description: None,
578        release_date: None,
579        parties: source_meta.parties,
580        keywords,
581        homepage_url: source_meta.homepage_url,
582        download_url: None,
583        size: None,
584        sha1: None,
585        md5: None,
586        sha256: None,
587        sha512: None,
588        bug_tracking_url: source_meta.bug_tracking_url,
589        code_view_url: source_meta.code_view_url,
590        vcs_url: source_meta.vcs_url,
591        copyright: None,
592        holder: None,
593        declared_license_expression: None,
594        declared_license_expression_spdx: None,
595        license_detections: Vec::new(),
596        other_license_expression: None,
597        other_license_expression_spdx: None,
598        other_license_detections: Vec::new(),
599        extracted_license_statement: None,
600        notice_text: None,
601        source_packages: Vec::new(),
602        file_references: Vec::new(),
603        is_private: false,
604        is_virtual: false,
605        extra_data: None,
606        dependencies,
607        repository_homepage_url: None,
608        repository_download_url: None,
609        api_data_url: None,
610        datasource_id: Some(DatasourceId::DebianControlInSource),
611        purl,
612    })
613}
614
615// ---------------------------------------------------------------------------
616// Namespace detection
617// ---------------------------------------------------------------------------
618
619fn detect_namespace(version: Option<&str>, maintainer: Option<&str>) -> Option<String> {
620    // Check version clues first
621    if let Some(ver) = version {
622        let ver_lower = ver.to_lowercase();
623        for clue in VERSION_CLUES_UBUNTU {
624            if ver_lower.contains(clue) {
625                return Some("ubuntu".to_string());
626            }
627        }
628        for clue in VERSION_CLUES_DEBIAN {
629            if ver_lower.contains(clue) {
630                return Some("debian".to_string());
631            }
632        }
633    }
634
635    // Check maintainer clues
636    if let Some(maint) = maintainer {
637        let maint_lower = maint.to_lowercase();
638        for clue in MAINTAINER_CLUES_UBUNTU {
639            if maint_lower.contains(clue) {
640                return Some("ubuntu".to_string());
641            }
642        }
643        for clue in MAINTAINER_CLUES_DEBIAN {
644            if maint_lower.contains(clue) {
645                return Some("debian".to_string());
646            }
647        }
648    }
649
650    // Default to debian
651    Some("debian".to_string())
652}
653
654// ---------------------------------------------------------------------------
655// PURL generation
656// ---------------------------------------------------------------------------
657
658fn build_debian_purl(
659    name: &str,
660    version: Option<&str>,
661    namespace: Option<&str>,
662    architecture: Option<&str>,
663) -> Option<String> {
664    let mut purl = PackageUrl::new(PACKAGE_TYPE.as_str(), name).ok()?;
665
666    if let Some(ns) = namespace {
667        purl.with_namespace(ns).ok()?;
668    }
669
670    if let Some(ver) = version {
671        purl.with_version(ver).ok()?;
672    }
673
674    if let Some(arch) = architecture {
675        purl.add_qualifier("arch", arch).ok()?;
676    }
677
678    Some(purl.to_string())
679}
680
681// ---------------------------------------------------------------------------
682// Dependency parsing
683// ---------------------------------------------------------------------------
684
685fn parse_all_dependencies(
686    headers: &HashMap<String, Vec<String>>,
687    namespace: Option<&str>,
688) -> Vec<Dependency> {
689    let mut dependencies = Vec::new();
690
691    for spec in DEP_FIELDS {
692        if let Some(dep_str) = rfc822::get_header_first(headers, spec.field) {
693            dependencies.extend(parse_dependency_field(
694                &dep_str,
695                spec.scope,
696                spec.is_runtime,
697                spec.is_optional,
698                namespace,
699            ));
700        }
701    }
702
703    dependencies
704}
705
706/// Parses a Debian dependency field value.
707///
708/// Debian dependencies are comma-separated, with optional version constraints
709/// in parentheses and alternative packages separated by `|`.
710///
711/// Format: `pkg1 (>= 1.0), pkg2 | pkg3 (<< 2.0), pkg4`
712///
713/// Alternatives (|) are treated as separate optional dependencies.
714fn parse_dependency_field(
715    dep_str: &str,
716    scope: &str,
717    is_runtime: bool,
718    is_optional: bool,
719    namespace: Option<&str>,
720) -> Vec<Dependency> {
721    let mut deps = Vec::new();
722
723    // Regex for parsing individual dependency: name (operator version)
724    // Debian operators: <<, <=, =, >=, >>
725    let dep_re = Regex::new(
726        r"^\s*([a-zA-Z0-9][a-zA-Z0-9.+\-]+)\s*(?:\(([<>=!]+)\s*([^)]+)\))?\s*(?:\[.*\])?\s*$",
727    )
728    .unwrap();
729
730    for group in dep_str.split(',') {
731        let group = group.trim();
732        if group.is_empty() {
733            continue;
734        }
735
736        // Handle alternatives (|)
737        let alternatives: Vec<&str> = group.split('|').collect();
738        let has_alternatives = alternatives.len() > 1;
739
740        for alt in alternatives {
741            let alt = alt.trim();
742            if alt.is_empty() {
743                continue;
744            }
745
746            if let Some(caps) = dep_re.captures(alt) {
747                let pkg_name = caps.get(1).map(|m| m.as_str().trim()).unwrap_or("");
748                let operator = caps.get(2).map(|m| m.as_str().trim());
749                let version = caps.get(3).map(|m| m.as_str().trim());
750
751                if pkg_name.is_empty() {
752                    continue;
753                }
754
755                // Skip substitution variables like ${shlibs:Depends}
756                if pkg_name.starts_with('$') {
757                    continue;
758                }
759
760                let extracted_requirement = match (operator, version) {
761                    (Some(op), Some(ver)) => Some(format!("{} {}", op, ver)),
762                    _ => None,
763                };
764
765                let is_pinned = operator.map(|op| op == "=");
766
767                let purl = build_debian_purl(pkg_name, None, namespace, None);
768
769                deps.push(Dependency {
770                    purl,
771                    extracted_requirement,
772                    scope: Some(scope.to_string()),
773                    is_runtime: Some(is_runtime),
774                    is_optional: Some(is_optional || has_alternatives),
775                    is_pinned,
776                    is_direct: Some(true),
777                    resolved_package: None,
778                    extra_data: None,
779                });
780            }
781        }
782    }
783
784    deps
785}
786
787// ---------------------------------------------------------------------------
788// Source field parsing
789// ---------------------------------------------------------------------------
790
791/// Parses the Source field which may contain a version in parentheses.
792///
793/// Format: `source-name` or `source-name (version)`
794fn parse_source_field(source: Option<&str>, namespace: Option<&str>) -> Vec<String> {
795    let Some(source_str) = source else {
796        return Vec::new();
797    };
798
799    let trimmed = source_str.trim();
800    if trimmed.is_empty() {
801        return Vec::new();
802    }
803
804    // Extract name and optional version from "name (version)" format
805    let (name, version) = if let Some(paren_start) = trimmed.find(" (") {
806        let name = trimmed[..paren_start].trim();
807        let version = trimmed[paren_start + 2..].trim_end_matches(')').trim();
808        (
809            name,
810            if version.is_empty() {
811                None
812            } else {
813                Some(version)
814            },
815        )
816    } else {
817        (trimmed, None)
818    };
819
820    if let Some(purl) = build_debian_purl(name, version, namespace, None) {
821        vec![purl]
822    } else {
823        Vec::new()
824    }
825}
826
827// ---------------------------------------------------------------------------
828// Parser registration macros
829// ---------------------------------------------------------------------------
830
// Registers the `debian/control` parser's metadata.
// NOTE(review): the arguments look like (description, path glob patterns,
// package type, an empty string of unknown meaning, documentation URL) --
// confirm against the `register_parser!` macro definition.
crate::register_parser!(
    "Debian source package control file (debian/control)",
    &["**/debian/control"],
    "deb",
    "",
    Some("https://www.debian.org/doc/debian-policy/ch-controlfields.html"),
);
838
839// Note: DebianInstalledParser uses try_parse_installed for Vec<PackageData>,
840// but we register it for the single-package interface too.
841
842// ============================================================================
843// WAVE 2 PARSERS: Additional Debian Format Support
844// ============================================================================
845
846/// Parser for Debian Source Control (.dsc) files
847pub struct DebianDscParser;
848
849impl PackageParser for DebianDscParser {
850    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
851
852    fn is_match(path: &Path) -> bool {
853        path.extension().and_then(|e| e.to_str()) == Some("dsc")
854    }
855
856    fn extract_packages(path: &Path) -> Vec<PackageData> {
857        let content = match read_file_to_string(path) {
858            Ok(c) => c,
859            Err(e) => {
860                warn!("Failed to read .dsc file {:?}: {}", path, e);
861                return vec![default_package_data(DatasourceId::DebianSourceControlDsc)];
862            }
863        };
864
865        vec![parse_dsc_content(&content)]
866    }
867}
868
/// Removes PGP clear-sign framing from `content`, keeping only the payload.
///
/// Dropped lines:
/// * the `-----BEGIN PGP SIGNED MESSAGE-----` marker,
/// * `Hash:` lines and the first blank line seen after that marker while no
///   payload has been emitted yet,
/// * everything from `-----BEGIN PGP SIGNATURE-----` through
///   `-----END PGP SIGNATURE-----`, markers included.
///
/// Every kept line is terminated with `\n`. Content without any PGP framing is
/// returned line for line.
fn strip_pgp_signature(content: &str) -> String {
    let mut payload = String::new();
    // True from the signed-message marker until the blank line ending its header.
    let mut in_armor_header = false;
    // True between the signature begin and end markers.
    let mut in_signature_block = false;

    for line in content.lines() {
        match line {
            l if l.starts_with("-----BEGIN PGP SIGNED MESSAGE-----") => in_armor_header = true,
            l if l.starts_with("-----BEGIN PGP SIGNATURE-----") => in_signature_block = true,
            l if l.starts_with("-----END PGP SIGNATURE-----") => in_signature_block = false,
            // Armor header line such as `Hash: SHA256`.
            l if in_armor_header && l.starts_with("Hash:") => {}
            // Blank separator between the armor header and the payload.
            l if in_armor_header && l.is_empty() && payload.is_empty() => in_armor_header = false,
            l if !in_signature_block => {
                payload.push_str(l);
                payload.push('\n');
            }
            // Line inside the signature block.
            _ => {}
        }
    }

    payload
}
902
903fn parse_dsc_content(content: &str) -> PackageData {
904    let clean_content = strip_pgp_signature(content);
905    let metadata = rfc822::parse_rfc822_content(&clean_content);
906    let headers = &metadata.headers;
907
908    let name = rfc822::get_header_first(headers, "source");
909    let version = rfc822::get_header_first(headers, "version");
910    let architecture = rfc822::get_header_first(headers, "architecture");
911    let namespace = Some("debian".to_string());
912
913    let mut package = PackageData {
914        datasource_id: Some(DatasourceId::DebianSourceControlDsc),
915        package_type: Some(PACKAGE_TYPE),
916        namespace: namespace.clone(),
917        name: name.clone(),
918        version: version.clone(),
919        description: rfc822::get_header_first(headers, "description"),
920        homepage_url: rfc822::get_header_first(headers, "homepage"),
921        vcs_url: rfc822::get_header_first(headers, "vcs-git"),
922        code_view_url: rfc822::get_header_first(headers, "vcs-browser"),
923        ..Default::default()
924    };
925
926    // Build PURL with architecture qualifier
927    if let (Some(n), Some(v)) = (&name, &version) {
928        package.purl = build_debian_purl(n, Some(v), namespace.as_deref(), architecture.as_deref());
929    }
930
931    // Set source_packages to point to the source itself (without version)
932    if let Some(n) = &name
933        && let Some(source_purl) = build_debian_purl(n, None, namespace.as_deref(), None)
934    {
935        package.source_packages.push(source_purl);
936    }
937
938    if let Some(maintainer) = rfc822::get_header_first(headers, "maintainer") {
939        let (name_opt, email_opt) = split_name_email(&maintainer);
940        package.parties.push(Party {
941            r#type: None,
942            role: Some("maintainer".to_string()),
943            name: name_opt,
944            email: email_opt,
945            url: None,
946            organization: None,
947            organization_url: None,
948            timezone: None,
949        });
950    }
951
952    if let Some(uploaders_str) = rfc822::get_header_first(headers, "uploaders") {
953        for uploader in uploaders_str.split(',') {
954            let uploader = uploader.trim();
955            if uploader.is_empty() {
956                continue;
957            }
958            let (name_opt, email_opt) = split_name_email(uploader);
959            package.parties.push(Party {
960                r#type: None,
961                role: Some("uploader".to_string()),
962                name: name_opt,
963                email: email_opt,
964                url: None,
965                organization: None,
966                organization_url: None,
967                timezone: None,
968            });
969        }
970    }
971
972    // Parse Build-Depends
973    if let Some(build_deps) = rfc822::get_header_first(headers, "build-depends") {
974        package.dependencies.extend(parse_dependency_field(
975            &build_deps,
976            "build",
977            false,
978            false,
979            namespace.as_deref(),
980        ));
981    }
982
983    // Store Standards-Version in extra_data
984    if let Some(standards) = rfc822::get_header_first(headers, "standards-version") {
985        let map = package.extra_data.get_or_insert_with(HashMap::new);
986        map.insert("standards_version".to_string(), standards.into());
987    }
988
989    package
990}
991
992/// Parser for Debian original source tarballs (*.orig.tar.*)
993pub struct DebianOrigTarParser;
994
995impl PackageParser for DebianOrigTarParser {
996    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
997
998    fn is_match(path: &Path) -> bool {
999        path.file_name()
1000            .and_then(|n| n.to_str())
1001            .map(|name| name.contains(".orig.tar."))
1002            .unwrap_or(false)
1003    }
1004
1005    fn extract_packages(path: &Path) -> Vec<PackageData> {
1006        let filename = match path.file_name().and_then(|n| n.to_str()) {
1007            Some(f) => f,
1008            None => {
1009                return vec![default_package_data(
1010                    DatasourceId::DebianOriginalSourceTarball,
1011                )];
1012            }
1013        };
1014
1015        vec![parse_source_tarball_filename(
1016            filename,
1017            DatasourceId::DebianOriginalSourceTarball,
1018        )]
1019    }
1020}
1021
1022/// Parser for Debian source package metadata tarballs (*.debian.tar.*)
1023pub struct DebianDebianTarParser;
1024
1025impl PackageParser for DebianDebianTarParser {
1026    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1027
1028    fn is_match(path: &Path) -> bool {
1029        path.file_name()
1030            .and_then(|n| n.to_str())
1031            .map(|name| name.contains(".debian.tar."))
1032            .unwrap_or(false)
1033    }
1034
1035    fn extract_packages(path: &Path) -> Vec<PackageData> {
1036        let filename = match path.file_name().and_then(|n| n.to_str()) {
1037            Some(f) => f,
1038            None => {
1039                return vec![default_package_data(
1040                    DatasourceId::DebianSourceMetadataTarball,
1041                )];
1042            }
1043        };
1044
1045        vec![parse_source_tarball_filename(
1046            filename,
1047            DatasourceId::DebianSourceMetadataTarball,
1048        )]
1049    }
1050}
1051
1052fn parse_source_tarball_filename(filename: &str, datasource_id: DatasourceId) -> PackageData {
1053    let without_tar_ext = filename
1054        .trim_end_matches(".gz")
1055        .trim_end_matches(".xz")
1056        .trim_end_matches(".bz2")
1057        .trim_end_matches(".tar");
1058
1059    let parts: Vec<&str> = without_tar_ext.splitn(2, '_').collect();
1060    if parts.len() < 2 {
1061        return default_package_data(datasource_id);
1062    }
1063
1064    let name = parts[0].to_string();
1065    let version_with_suffix = parts[1];
1066
1067    let version = version_with_suffix
1068        .trim_end_matches(".orig")
1069        .trim_end_matches(".debian")
1070        .to_string();
1071
1072    let namespace = Some("debian".to_string());
1073
1074    PackageData {
1075        datasource_id: Some(datasource_id),
1076        package_type: Some(PACKAGE_TYPE),
1077        namespace: namespace.clone(),
1078        name: Some(name.clone()),
1079        version: Some(version.clone()),
1080        purl: build_debian_purl(&name, Some(&version), namespace.as_deref(), None),
1081        ..Default::default()
1082    }
1083}
1084
1085/// Parser for Debian installed file lists (*.list)
1086pub struct DebianInstalledListParser;
1087
1088impl PackageParser for DebianInstalledListParser {
1089    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1090
1091    fn is_match(path: &Path) -> bool {
1092        path.extension().and_then(|e| e.to_str()) == Some("list")
1093            && path
1094                .to_str()
1095                .map(|p| p.contains("/var/lib/dpkg/info/"))
1096                .unwrap_or(false)
1097    }
1098
1099    fn extract_packages(path: &Path) -> Vec<PackageData> {
1100        let filename = match path.file_stem().and_then(|s| s.to_str()) {
1101            Some(f) => f,
1102            None => {
1103                return vec![default_package_data(DatasourceId::DebianInstalledFilesList)];
1104            }
1105        };
1106
1107        let content = match read_file_to_string(path) {
1108            Ok(c) => c,
1109            Err(e) => {
1110                warn!("Failed to read .list file {:?}: {}", path, e);
1111                return vec![default_package_data(DatasourceId::DebianInstalledFilesList)];
1112            }
1113        };
1114
1115        vec![parse_debian_file_list(
1116            &content,
1117            filename,
1118            DatasourceId::DebianInstalledFilesList,
1119        )]
1120    }
1121}
1122
1123/// Parser for Debian installed MD5 checksum files (*.md5sums)
1124pub struct DebianInstalledMd5sumsParser;
1125
1126impl PackageParser for DebianInstalledMd5sumsParser {
1127    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1128
1129    fn is_match(path: &Path) -> bool {
1130        path.extension().and_then(|e| e.to_str()) == Some("md5sums")
1131            && path
1132                .to_str()
1133                .map(|p| p.contains("/var/lib/dpkg/info/"))
1134                .unwrap_or(false)
1135    }
1136
1137    fn extract_packages(path: &Path) -> Vec<PackageData> {
1138        let filename = match path.file_stem().and_then(|s| s.to_str()) {
1139            Some(f) => f,
1140            None => {
1141                return vec![default_package_data(DatasourceId::DebianInstalledMd5Sums)];
1142            }
1143        };
1144
1145        let content = match read_file_to_string(path) {
1146            Ok(c) => c,
1147            Err(e) => {
1148                warn!("Failed to read .md5sums file {:?}: {}", path, e);
1149                return vec![default_package_data(DatasourceId::DebianInstalledMd5Sums)];
1150            }
1151        };
1152
1153        vec![parse_debian_file_list(
1154            &content,
1155            filename,
1156            DatasourceId::DebianInstalledMd5Sums,
1157        )]
1158    }
1159}
1160
1161const IGNORED_ROOT_DIRS: &[&str] = &["/.", "/bin", "/etc", "/lib", "/sbin", "/usr", "/var"];
1162
1163fn parse_debian_file_list(
1164    content: &str,
1165    filename: &str,
1166    datasource_id: DatasourceId,
1167) -> PackageData {
1168    let (name, arch_qualifier) = if let Some((pkg, arch)) = filename.split_once(':') {
1169        (Some(pkg.to_string()), Some(arch.to_string()))
1170    } else if filename == "md5sums" {
1171        (None, None)
1172    } else {
1173        (Some(filename.to_string()), None)
1174    };
1175
1176    let mut file_references = Vec::new();
1177
1178    for line in content.lines() {
1179        let line = line.trim();
1180        if line.is_empty() || line.starts_with('#') {
1181            continue;
1182        }
1183
1184        let (md5sum, path) = if let Some((hash, p)) = line.split_once(' ') {
1185            (Some(hash.trim().to_string()), p.trim())
1186        } else {
1187            (None, line)
1188        };
1189
1190        if IGNORED_ROOT_DIRS.contains(&path) {
1191            continue;
1192        }
1193
1194        file_references.push(FileReference {
1195            path: path.to_string(),
1196            size: None,
1197            sha1: None,
1198            md5: md5sum,
1199            sha256: None,
1200            sha512: None,
1201            extra_data: None,
1202        });
1203    }
1204
1205    if file_references.is_empty() {
1206        return default_package_data(datasource_id);
1207    }
1208
1209    let namespace = Some("debian".to_string());
1210    let mut package = PackageData {
1211        datasource_id: Some(datasource_id),
1212        package_type: Some(PACKAGE_TYPE),
1213        namespace: namespace.clone(),
1214        name: name.clone(),
1215        file_references,
1216        ..Default::default()
1217    };
1218
1219    if let Some(n) = &name {
1220        package.purl = build_debian_purl(n, None, namespace.as_deref(), arch_qualifier.as_deref());
1221    }
1222
1223    package
1224}
1225
1226/// Parser for Debian machine-readable copyright files (DEP-5 format)
1227pub struct DebianCopyrightParser;
1228
1229impl PackageParser for DebianCopyrightParser {
1230    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1231
1232    fn is_match(path: &Path) -> bool {
1233        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
1234            if filename != "copyright" {
1235                return false;
1236            }
1237            let path_str = path.to_string_lossy();
1238            path_str.contains("/debian/")
1239                || path_str.contains("/usr/share/doc/")
1240                || path_str.ends_with("debian/copyright")
1241        } else {
1242            false
1243        }
1244    }
1245
1246    fn extract_packages(path: &Path) -> Vec<PackageData> {
1247        let content = match read_file_to_string(path) {
1248            Ok(c) => c,
1249            Err(e) => {
1250                warn!("Failed to read copyright file {:?}: {}", path, e);
1251                return vec![default_package_data(DatasourceId::DebianCopyright)];
1252            }
1253        };
1254
1255        let package_name = extract_package_name_from_path(path);
1256        vec![parse_copyright_file(&content, package_name.as_deref())]
1257    }
1258}
1259
/// Returns the path component that directly follows the first `doc` directory, e.g.
/// `libssl3` for `/usr/share/doc/libssl3/copyright`.
///
/// Yields `None` when no `doc` component is followed by a normal component, or when
/// that following component is not valid UTF-8.
fn extract_package_name_from_path(path: &Path) -> Option<String> {
    use std::path::Component;

    let parts: Vec<_> = path.components().collect();

    for pair in parts.windows(2) {
        match (pair[0], pair[1]) {
            (Component::Normal(dir), Component::Normal(child))
                if dir.to_str() == Some("doc") =>
            {
                return child.to_str().map(str::to_owned);
            }
            _ => {}
        }
    }

    None
}
1274
1275fn parse_copyright_file(content: &str, package_name: Option<&str>) -> PackageData {
1276    let paragraphs = parse_copyright_paragraphs_with_lines(content);
1277
1278    let is_dep5 = paragraphs
1279        .first()
1280        .and_then(|p| rfc822::get_header_first(&p.metadata.headers, "format"))
1281        .is_some();
1282
1283    let namespace = Some("debian".to_string());
1284    let mut parties = Vec::new();
1285    let mut license_statements = Vec::new();
1286    let mut primary_license_detection = None;
1287    let mut header_license_detection = None;
1288    let mut other_license_detections = Vec::new();
1289
1290    if is_dep5 {
1291        for para in &paragraphs {
1292            if let Some(copyright_text) =
1293                rfc822::get_header_first(&para.metadata.headers, "copyright")
1294            {
1295                for holder in parse_copyright_holders(&copyright_text) {
1296                    if !holder.is_empty() {
1297                        parties.push(Party {
1298                            r#type: None,
1299                            role: Some("copyright-holder".to_string()),
1300                            name: Some(holder),
1301                            email: None,
1302                            url: None,
1303                            organization: None,
1304                            organization_url: None,
1305                            timezone: None,
1306                        });
1307                    }
1308                }
1309            }
1310
1311            if let Some(license) = rfc822::get_header_first(&para.metadata.headers, "license") {
1312                let license_name = license.lines().next().unwrap_or(&license).trim();
1313                if !license_name.is_empty()
1314                    && !license_statements.contains(&license_name.to_string())
1315                {
1316                    license_statements.push(license_name.to_string());
1317                }
1318
1319                if let Some((matched_text, line_no)) = para.license_header_line.clone() {
1320                    let detection =
1321                        build_primary_license_detection(license_name, matched_text, line_no);
1322                    let is_header_paragraph =
1323                        rfc822::get_header_first(&para.metadata.headers, "format").is_some();
1324                    if rfc822::get_header_first(&para.metadata.headers, "files").as_deref()
1325                        == Some("*")
1326                    {
1327                        primary_license_detection = Some(detection);
1328                    } else if is_header_paragraph {
1329                        header_license_detection.get_or_insert(detection);
1330                    } else {
1331                        other_license_detections.push(detection);
1332                    }
1333                }
1334            }
1335        }
1336
1337        if primary_license_detection.is_none() && header_license_detection.is_some() {
1338            primary_license_detection = header_license_detection;
1339        }
1340    } else {
1341        let copyright_block = extract_unstructured_field(content, "Copyright:");
1342        if let Some(text) = copyright_block {
1343            for holder in parse_copyright_holders(&text) {
1344                if !holder.is_empty() {
1345                    parties.push(Party {
1346                        r#type: None,
1347                        role: Some("copyright-holder".to_string()),
1348                        name: Some(holder),
1349                        email: None,
1350                        url: None,
1351                        organization: None,
1352                        organization_url: None,
1353                        timezone: None,
1354                    });
1355                }
1356            }
1357        }
1358
1359        let license_block = extract_unstructured_field(content, "License:");
1360        if let Some(text) = license_block {
1361            license_statements.push(text.lines().next().unwrap_or(&text).trim().to_string());
1362        }
1363    }
1364
1365    let extracted_license_statement = if license_statements.is_empty() {
1366        None
1367    } else {
1368        Some(license_statements.join(" AND "))
1369    };
1370
1371    let license_detections = primary_license_detection.into_iter().collect::<Vec<_>>();
1372    let declared_license_expression = license_detections
1373        .first()
1374        .map(|detection| detection.license_expression.clone());
1375    let declared_license_expression_spdx = license_detections
1376        .first()
1377        .map(|detection| detection.license_expression_spdx.clone());
1378    let other_license_expression = combine_license_expressions(
1379        other_license_detections
1380            .iter()
1381            .map(|detection| detection.license_expression.clone()),
1382    );
1383    let other_license_expression_spdx = combine_license_expressions(
1384        other_license_detections
1385            .iter()
1386            .map(|detection| detection.license_expression_spdx.clone()),
1387    );
1388
1389    PackageData {
1390        datasource_id: Some(DatasourceId::DebianCopyright),
1391        package_type: Some(PACKAGE_TYPE),
1392        namespace: namespace.clone(),
1393        name: package_name.map(|s| s.to_string()),
1394        parties,
1395        declared_license_expression,
1396        declared_license_expression_spdx,
1397        license_detections,
1398        other_license_expression,
1399        other_license_expression_spdx,
1400        other_license_detections,
1401        extracted_license_statement,
1402        purl: package_name.and_then(|n| build_debian_purl(n, None, namespace.as_deref(), None)),
1403        ..Default::default()
1404    }
1405}
1406
/// One blank-line-delimited paragraph of a Debian copyright file.
#[derive(Debug)]
struct CopyrightParagraph {
    /// Header fields of the paragraph, keyed by lowercased field name.
    metadata: Rfc822Metadata,
    /// The paragraph's first `License:` header line (trailing whitespace removed) and
    /// its 1-based line number within the file, when the paragraph has such a line.
    license_header_line: Option<(String, usize)>,
}
1412
1413fn parse_copyright_paragraphs_with_lines(content: &str) -> Vec<CopyrightParagraph> {
1414    let mut paragraphs = Vec::new();
1415    let mut current_lines = Vec::new();
1416    let mut current_start_line = 1usize;
1417
1418    for (idx, line) in content.lines().enumerate() {
1419        let line_no = idx + 1;
1420        if line.is_empty() {
1421            if !current_lines.is_empty() {
1422                paragraphs.push(finalize_copyright_paragraph(
1423                    std::mem::take(&mut current_lines),
1424                    current_start_line,
1425                ));
1426            }
1427            current_start_line = line_no + 1;
1428        } else {
1429            if current_lines.is_empty() {
1430                current_start_line = line_no;
1431            }
1432            current_lines.push(line.to_string());
1433        }
1434    }
1435
1436    if !current_lines.is_empty() {
1437        paragraphs.push(finalize_copyright_paragraph(
1438            current_lines,
1439            current_start_line,
1440        ));
1441    }
1442
1443    paragraphs
1444}
1445
1446fn finalize_copyright_paragraph(raw_lines: Vec<String>, start_line: usize) -> CopyrightParagraph {
1447    let mut headers: HashMap<String, Vec<String>> = HashMap::new();
1448    let mut current_name: Option<String> = None;
1449    let mut current_value = String::new();
1450    let mut license_header_line = None;
1451
1452    for (idx, line) in raw_lines.iter().enumerate() {
1453        if line.starts_with(' ') || line.starts_with('\t') {
1454            if current_name.is_some() {
1455                current_value.push('\n');
1456                current_value.push_str(line);
1457            }
1458            continue;
1459        }
1460
1461        if let Some(name) = current_name.take() {
1462            add_copyright_header_value(&mut headers, &name, &current_value);
1463            current_value.clear();
1464        }
1465
1466        if let Some((name, value)) = line.split_once(':') {
1467            let normalized_name = name.trim().to_ascii_lowercase();
1468            if normalized_name == "license" && license_header_line.is_none() {
1469                license_header_line = Some((line.trim_end().to_string(), start_line + idx));
1470            }
1471            current_name = Some(normalized_name);
1472            current_value = value.trim_start().to_string();
1473        }
1474    }
1475
1476    if let Some(name) = current_name.take() {
1477        add_copyright_header_value(&mut headers, &name, &current_value);
1478    }
1479
1480    CopyrightParagraph {
1481        metadata: Rfc822Metadata {
1482            headers,
1483            body: String::new(),
1484        },
1485        license_header_line,
1486    }
1487}
1488
/// Records `value` under `name`, dropping trailing whitespace.
///
/// The entry for `name` is always created, but a value that is empty after trimming
/// is not stored in it.
fn add_copyright_header_value(headers: &mut HashMap<String, Vec<String>>, name: &str, value: &str) {
    let values = headers.entry(name.to_owned()).or_insert_with(Vec::new);
    match value.trim_end() {
        "" => {}
        kept => values.push(kept.to_owned()),
    }
}
1496
1497fn build_primary_license_detection(
1498    license_name: &str,
1499    matched_text: String,
1500    line_no: usize,
1501) -> LicenseDetection {
1502    let normalized = normalize_debian_license_name(license_name);
1503
1504    build_declared_license_detection(
1505        &normalized,
1506        DeclaredLicenseMatchMetadata::new(&matched_text, line_no, line_no),
1507    )
1508}
1509
1510fn normalize_debian_license_name(license_name: &str) -> NormalizedDeclaredLicense {
1511    match license_name.trim() {
1512        "GPL-2+" => NormalizedDeclaredLicense::new("gpl-2.0-plus", "GPL-2.0-or-later"),
1513        "GPL-2" => NormalizedDeclaredLicense::new("gpl-2.0", "GPL-2.0-only"),
1514        "LGPL-2+" => NormalizedDeclaredLicense::new("lgpl-2.0-plus", "LGPL-2.0-or-later"),
1515        "LGPL-2.1" => NormalizedDeclaredLicense::new("lgpl-2.1", "LGPL-2.1-only"),
1516        "LGPL-2.1+" => NormalizedDeclaredLicense::new("lgpl-2.1-plus", "LGPL-2.1-or-later"),
1517        "LGPL-3+" => NormalizedDeclaredLicense::new("lgpl-3.0-plus", "LGPL-3.0-or-later"),
1518        "BSD-4-clause" => NormalizedDeclaredLicense::new("bsd-original-uc", "BSD-4-Clause-UC"),
1519        "public-domain" => {
1520            NormalizedDeclaredLicense::new("public-domain", "LicenseRef-provenant-public-domain")
1521        }
1522        other => normalize_declared_license_key(other)
1523            .unwrap_or_else(|| NormalizedDeclaredLicense::new(other.to_ascii_lowercase(), other)),
1524    }
1525}
1526
/// Extracts copyright holder names from the text of a copyright statement.
///
/// Each non-blank line is handled on its own: leading `Copyright` / `copyright` /
/// `(C)` / `(c)` / `©` markers are removed (in any order, with whitespace between
/// them), then everything up to the first alphabetic character (years, punctuation) is
/// skipped. What remains is the holder. Holders of two bytes or fewer are discarded.
fn parse_copyright_holders(text: &str) -> Vec<String> {
    const MARKERS: [&str; 5] = ["Copyright", "copyright", "(C)", "(c)", "©"];

    let mut holders = Vec::new();

    for line in text.lines() {
        let mut rest = line.trim();

        // Strip markers repeatedly, skipping the whitespace that separates them, so that
        // "Copyright (C) 2020 Foo" loses both "Copyright" and "(C)".
        while let Some(stripped) = MARKERS.iter().find_map(|marker| rest.strip_prefix(*marker)) {
            rest = stripped.trim_start();
        }

        // Skip years and punctuation: the holder starts at the first alphabetic character.
        if let Some(holder_start) = rest.find(char::is_alphabetic) {
            let holder = &rest[holder_start..];
            if holder.len() > 2 {
                holders.push(holder.to_string());
            }
        }
    }

    holders
}
1559
/// Collects the value of a `field_name` block from free-form text.
///
/// Collection starts at the first line beginning with `field_name` and takes in the
/// following whitespace-indented lines (each trimmed, joined with `\n`). Blank lines
/// are skipped, and the first non-blank, non-indented line that does not itself start
/// with `field_name` ends the block. Returns `None` when nothing but whitespace was
/// collected.
fn extract_unstructured_field(content: &str, field_name: &str) -> Option<String> {
    let mut pieces: Vec<&str> = Vec::new();
    let mut seen_field = false;

    for line in content.lines() {
        if line.starts_with(field_name) {
            seen_field = true;
            pieces.push(line.trim_start_matches(field_name).trim());
            continue;
        }
        if !seen_field {
            continue;
        }
        if line.starts_with(char::is_whitespace) {
            pieces.push(line.trim());
            continue;
        }
        if !line.trim().is_empty() {
            break;
        }
    }

    let joined = pieces.join("\n");
    match joined.trim() {
        "" => None,
        text => Some(text.to_owned()),
    }
}
1586
1587/// Parser for Debian binary package archives (.deb files)
1588pub struct DebianDebParser;
1589
1590impl PackageParser for DebianDebParser {
1591    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1592
1593    fn is_match(path: &Path) -> bool {
1594        path.extension().and_then(|e| e.to_str()) == Some("deb")
1595    }
1596
1597    fn extract_packages(path: &Path) -> Vec<PackageData> {
1598        // Try to extract metadata from archive contents first
1599        if let Ok(data) = extract_deb_archive(path) {
1600            return vec![data];
1601        }
1602
1603        // Fallback to filename parsing
1604        let filename = match path.file_name().and_then(|n| n.to_str()) {
1605            Some(f) => f,
1606            None => {
1607                return vec![default_package_data(DatasourceId::DebianDeb)];
1608            }
1609        };
1610
1611        vec![parse_deb_filename(filename)]
1612    }
1613}
1614
1615fn extract_deb_archive(path: &Path) -> Result<PackageData, String> {
1616    use flate2::read::GzDecoder;
1617    use liblzma::read::XzDecoder;
1618    use std::io::{Cursor, Read};
1619
1620    let file = std::fs::File::open(path).map_err(|e| format!("Failed to open .deb file: {}", e))?;
1621
1622    let mut archive = ar::Archive::new(file);
1623    let mut package: Option<PackageData> = None;
1624
1625    while let Some(entry_result) = archive.next_entry() {
1626        let mut entry = entry_result.map_err(|e| format!("Failed to read ar entry: {}", e))?;
1627
1628        let entry_name = std::str::from_utf8(entry.header().identifier())
1629            .map_err(|e| format!("Invalid entry name: {}", e))?;
1630        let entry_name = entry_name.trim().to_string();
1631
1632        if entry_name == "control.tar.gz" || entry_name.starts_with("control.tar") {
1633            let mut control_data = Vec::new();
1634            entry
1635                .read_to_end(&mut control_data)
1636                .map_err(|e| format!("Failed to read control.tar.gz: {}", e))?;
1637
1638            if entry_name.ends_with(".gz") {
1639                let decoder = GzDecoder::new(Cursor::new(control_data));
1640                if let Some(parsed_package) = parse_control_tar_archive(decoder)? {
1641                    package = Some(parsed_package);
1642                }
1643            } else if entry_name.ends_with(".xz") {
1644                let decoder = XzDecoder::new(Cursor::new(control_data));
1645                if let Some(parsed_package) = parse_control_tar_archive(decoder)? {
1646                    package = Some(parsed_package);
1647                }
1648            }
1649        } else if entry_name.starts_with("data.tar") {
1650            let mut data = Vec::new();
1651            entry
1652                .read_to_end(&mut data)
1653                .map_err(|e| format!("Failed to read data archive: {}", e))?;
1654
1655            let Some(current_package) = package.as_mut() else {
1656                continue;
1657            };
1658
1659            if entry_name.ends_with(".gz") {
1660                let decoder = GzDecoder::new(Cursor::new(data));
1661                merge_deb_data_archive(decoder, current_package)?;
1662            } else if entry_name.ends_with(".xz") {
1663                let decoder = XzDecoder::new(Cursor::new(data));
1664                merge_deb_data_archive(decoder, current_package)?;
1665            }
1666        }
1667    }
1668
1669    package.ok_or_else(|| ".deb archive does not contain control.tar.* metadata".to_string())
1670}
1671
1672fn parse_control_tar_archive<R: std::io::Read>(reader: R) -> Result<Option<PackageData>, String> {
1673    use std::io::Read;
1674
1675    let mut tar_archive = tar::Archive::new(reader);
1676
1677    for tar_entry_result in tar_archive
1678        .entries()
1679        .map_err(|e| format!("Failed to read tar entries: {}", e))?
1680    {
1681        let mut tar_entry =
1682            tar_entry_result.map_err(|e| format!("Failed to read tar entry: {}", e))?;
1683
1684        let tar_path = tar_entry
1685            .path()
1686            .map_err(|e| format!("Failed to get tar path: {}", e))?;
1687
1688        if tar_path.ends_with("control") {
1689            let mut control_content = String::new();
1690            tar_entry
1691                .read_to_string(&mut control_content)
1692                .map_err(|e| format!("Failed to read control file: {}", e))?;
1693
1694            let paragraphs = rfc822::parse_rfc822_paragraphs(&control_content);
1695            if paragraphs.is_empty() {
1696                return Err("No paragraphs in control file".to_string());
1697            }
1698
1699            if let Some(package) =
1700                build_package_from_paragraph(&paragraphs[0], None, DatasourceId::DebianDeb)
1701            {
1702                return Ok(Some(package));
1703            }
1704
1705            return Err("Failed to parse control file".to_string());
1706        }
1707    }
1708
1709    Ok(None)
1710}
1711
1712fn merge_deb_data_archive<R: std::io::Read>(
1713    reader: R,
1714    package: &mut PackageData,
1715) -> Result<(), String> {
1716    use std::io::Read;
1717
1718    let mut tar_archive = tar::Archive::new(reader);
1719
1720    for tar_entry_result in tar_archive
1721        .entries()
1722        .map_err(|e| format!("Failed to read data tar entries: {}", e))?
1723    {
1724        let mut tar_entry =
1725            tar_entry_result.map_err(|e| format!("Failed to read data tar entry: {}", e))?;
1726
1727        let tar_path = tar_entry
1728            .path()
1729            .map_err(|e| format!("Failed to get data tar path: {}", e))?;
1730        let tar_path_str = tar_path.to_string_lossy();
1731
1732        if tar_path_str.ends_with(&format!(
1733            "/usr/share/doc/{}/copyright",
1734            package.name.as_deref().unwrap_or_default()
1735        )) || tar_path_str.ends_with(&format!(
1736            "usr/share/doc/{}/copyright",
1737            package.name.as_deref().unwrap_or_default()
1738        )) {
1739            let mut copyright_content = String::new();
1740            tar_entry
1741                .read_to_string(&mut copyright_content)
1742                .map_err(|e| format!("Failed to read copyright file from data tar: {}", e))?;
1743
1744            let copyright_pkg = parse_copyright_file(&copyright_content, package.name.as_deref());
1745            merge_debian_copyright_into_package(package, &copyright_pkg);
1746            break;
1747        }
1748    }
1749
1750    Ok(())
1751}
1752
1753fn merge_debian_copyright_into_package(target: &mut PackageData, copyright: &PackageData) {
1754    if target.extracted_license_statement.is_none() {
1755        target.extracted_license_statement = copyright.extracted_license_statement.clone();
1756    }
1757
1758    for party in &copyright.parties {
1759        if !target.parties.iter().any(|existing| {
1760            existing.r#type == party.r#type
1761                && existing.role == party.role
1762                && existing.name == party.name
1763                && existing.email == party.email
1764                && existing.url == party.url
1765                && existing.organization == party.organization
1766                && existing.organization_url == party.organization_url
1767                && existing.timezone == party.timezone
1768        }) {
1769            target.parties.push(party.clone());
1770        }
1771    }
1772}
1773
1774fn parse_deb_filename(filename: &str) -> PackageData {
1775    let without_ext = filename.trim_end_matches(".deb");
1776
1777    let parts: Vec<&str> = without_ext.split('_').collect();
1778    if parts.len() < 2 {
1779        return default_package_data(DatasourceId::DebianDeb);
1780    }
1781
1782    let name = parts[0].to_string();
1783    let version = parts[1].to_string();
1784    let architecture = if parts.len() >= 3 {
1785        Some(parts[2].to_string())
1786    } else {
1787        None
1788    };
1789
1790    let namespace = Some("debian".to_string());
1791
1792    PackageData {
1793        datasource_id: Some(DatasourceId::DebianDeb),
1794        package_type: Some(PACKAGE_TYPE),
1795        namespace: namespace.clone(),
1796        name: Some(name.clone()),
1797        version: Some(version.clone()),
1798        purl: build_debian_purl(
1799            &name,
1800            Some(&version),
1801            namespace.as_deref(),
1802            architecture.as_deref(),
1803        ),
1804        ..Default::default()
1805    }
1806}
1807
1808/// Parser for control files inside extracted .deb control tarballs.
1809///
1810/// Matches paths like `*/control.tar.gz-extract/control` and
1811/// `*/control.tar.xz-extract/control` which are created by ExtractCode
1812/// when extracting .deb archives.
1813pub struct DebianControlInExtractedDebParser;
1814
1815impl PackageParser for DebianControlInExtractedDebParser {
1816    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1817
1818    fn is_match(path: &Path) -> bool {
1819        path.file_name()
1820            .and_then(|n| n.to_str())
1821            .is_some_and(|name| name == "control")
1822            && path
1823                .to_str()
1824                .map(|p| {
1825                    p.ends_with("control.tar.gz-extract/control")
1826                        || p.ends_with("control.tar.xz-extract/control")
1827                })
1828                .unwrap_or(false)
1829    }
1830
1831    fn extract_packages(path: &Path) -> Vec<PackageData> {
1832        let content = match read_file_to_string(path) {
1833            Ok(c) => c,
1834            Err(e) => {
1835                warn!(
1836                    "Failed to read control file in extracted deb {:?}: {}",
1837                    path, e
1838                );
1839                return vec![default_package_data(
1840                    DatasourceId::DebianControlExtractedDeb,
1841                )];
1842            }
1843        };
1844
1845        // A control file inside an extracted .deb has a single paragraph
1846        // (unlike debian/control which has source + binary paragraphs)
1847        let paragraphs = rfc822::parse_rfc822_paragraphs(&content);
1848        if paragraphs.is_empty() {
1849            return vec![default_package_data(
1850                DatasourceId::DebianControlExtractedDeb,
1851            )];
1852        }
1853
1854        if let Some(pkg) = build_package_from_paragraph(
1855            &paragraphs[0],
1856            None,
1857            DatasourceId::DebianControlExtractedDeb,
1858        ) {
1859            vec![pkg]
1860        } else {
1861            vec![default_package_data(
1862                DatasourceId::DebianControlExtractedDeb,
1863            )]
1864        }
1865    }
1866}
1867
1868/// Parser for MD5 checksum files inside extracted .deb control tarballs
1869pub struct DebianMd5sumInPackageParser;
1870
1871impl PackageParser for DebianMd5sumInPackageParser {
1872    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
1873
1874    fn is_match(path: &Path) -> bool {
1875        path.file_name()
1876            .and_then(|n| n.to_str())
1877            .is_some_and(|name| name == "md5sums")
1878            && path
1879                .to_str()
1880                .map(|p| {
1881                    p.ends_with("control.tar.gz-extract/md5sums")
1882                        || p.ends_with("control.tar.xz-extract/md5sums")
1883                })
1884                .unwrap_or(false)
1885    }
1886
1887    fn extract_packages(path: &Path) -> Vec<PackageData> {
1888        let content = match read_file_to_string(path) {
1889            Ok(c) => c,
1890            Err(e) => {
1891                warn!("Failed to read md5sums file {:?}: {}", path, e);
1892                return vec![default_package_data(
1893                    DatasourceId::DebianMd5SumsInExtractedDeb,
1894                )];
1895            }
1896        };
1897
1898        let package_name = extract_package_name_from_deb_path(path);
1899
1900        vec![parse_md5sums_in_package(&content, package_name.as_deref())]
1901    }
1902}
1903
/// Derives the package name from the path of a file inside an extracted `.deb`.
///
/// The directory two levels above the file must be named
/// `<something>.deb-extract`; the package name is everything in `<something>`
/// before the first `_` (for `example_1.0-1_amd64.deb-extract` that is
/// `example`).
///
/// Returns `None` when the path has fewer than two parent levels, when that
/// directory name is not valid UTF-8 or does not end in `.deb-extract`, or
/// when the resulting name would be empty.
pub(crate) fn extract_package_name_from_deb_path(path: &Path) -> Option<String> {
    let deb_extract_dir = path.parent()?.parent()?.file_name()?.to_str()?;
    let deb_file_stem = deb_extract_dir
        .strip_suffix("-extract")?
        .strip_suffix(".deb")?;

    deb_file_stem
        .split('_')
        .next()
        // An empty name (e.g. `_1.0_amd64.deb-extract`) identifies no package.
        .filter(|name| !name.is_empty())
        .map(str::to_owned)
}
1914
1915fn parse_md5sums_in_package(content: &str, package_name: Option<&str>) -> PackageData {
1916    let mut file_references = Vec::new();
1917
1918    for line in content.lines() {
1919        let line = line.trim();
1920        if line.is_empty() || line.starts_with('#') {
1921            continue;
1922        }
1923
1924        let (md5sum, filepath): (Option<String>, &str) = if let Some(idx) = line.find("  ") {
1925            (Some(line[..idx].trim().to_string()), line[idx + 2..].trim())
1926        } else if let Some((hash, path)) = line.split_once(' ') {
1927            (Some(hash.trim().to_string()), path.trim())
1928        } else {
1929            (None, line)
1930        };
1931
1932        if IGNORED_ROOT_DIRS.contains(&filepath) {
1933            continue;
1934        }
1935
1936        file_references.push(FileReference {
1937            path: filepath.to_string(),
1938            size: None,
1939            sha1: None,
1940            md5: md5sum,
1941            sha256: None,
1942            sha512: None,
1943            extra_data: None,
1944        });
1945    }
1946
1947    if file_references.is_empty() {
1948        return default_package_data(DatasourceId::DebianMd5SumsInExtractedDeb);
1949    }
1950
1951    let namespace = Some("debian".to_string());
1952    let mut package = PackageData {
1953        datasource_id: Some(DatasourceId::DebianMd5SumsInExtractedDeb),
1954        package_type: Some(PACKAGE_TYPE),
1955        namespace: namespace.clone(),
1956        name: package_name.map(|s| s.to_string()),
1957        file_references,
1958        ..Default::default()
1959    };
1960
1961    if let Some(n) = &package.name {
1962        package.purl = build_debian_purl(n, None, namespace.as_deref(), None);
1963    }
1964
1965    package
1966}
1967
// Registers metadata for the control-file-in-extracted-.deb format: a
// human-readable description, the path globs it applies to, and a link to the
// Debian Policy chapter on control fields.
// NOTE(review): the third ("deb") and fourth ("") arguments are interpreted by
// `register_parser!`, which is defined elsewhere — presumably the package type
// and an (unset) primary language; confirm against the macro definition.
crate::register_parser!(
    "Debian control file in extracted .deb control tarball",
    &[
        "**/control.tar.gz-extract/control",
        "**/control.tar.xz-extract/control"
    ],
    "deb",
    "",
    Some("https://www.debian.org/doc/debian-policy/ch-controlfields.html"),
);
1978
// Registers metadata for the md5sums-in-extracted-.deb format: a
// human-readable description, the path globs it applies to, and a
// documentation link.
// NOTE(review): the third ("deb") and fourth ("") arguments are interpreted by
// `register_parser!`, which is defined elsewhere — presumably the package type
// and an (unset) primary language; confirm against the macro definition.
// NOTE(review): the linked page covers control fields, not the md5sums file
// itself — confirm this is the intended reference.
crate::register_parser!(
    "Debian MD5 checksums in extracted .deb control tarball",
    &[
        "**/control.tar.gz-extract/md5sums",
        "**/control.tar.xz-extract/md5sums"
    ],
    "deb",
    "",
    Some("https://www.debian.org/doc/debian-policy/ch-controlfields.html"),
);
1989
1990#[cfg(test)]
1991mod tests {
1992    use super::*;
1993    use crate::models::DatasourceId;
1994    use crate::models::PackageType;
1995    use ar::{Builder as ArBuilder, Header as ArHeader};
1996    use flate2::Compression;
1997    use flate2::write::GzEncoder;
1998    use liblzma::write::XzEncoder;
1999    use std::io::Cursor;
2000    use std::path::PathBuf;
2001    use tar::{Builder as TarBuilder, Header as TarHeader};
2002    use tempfile::NamedTempFile;
2003
2004    fn create_synthetic_deb_with_control_tar_xz() -> NamedTempFile {
2005        let mut control_tar = Vec::new();
2006        {
2007            let encoder = XzEncoder::new(&mut control_tar, 6);
2008            let mut tar_builder = TarBuilder::new(encoder);
2009
2010            let control_content = b"Package: synthetic\nVersion: 1.2.3\nArchitecture: amd64\nDescription: Synthetic deb\nHomepage: https://example.com\n";
2011            let mut header = TarHeader::new_gnu();
2012            header
2013                .set_path("control")
2014                .expect("control tar path should be valid");
2015            header.set_size(control_content.len() as u64);
2016            header.set_mode(0o644);
2017            header.set_cksum();
2018            tar_builder
2019                .append(&header, Cursor::new(control_content))
2020                .expect("control file should be appended to tar.xz");
2021            tar_builder.finish().expect("control tar.xz should finish");
2022        }
2023
2024        let deb = NamedTempFile::new().expect("temp deb file should be created");
2025        {
2026            let mut builder = ArBuilder::new(
2027                deb.reopen()
2028                    .expect("temporary deb file should reopen for writing"),
2029            );
2030
2031            let debian_binary = b"2.0\n";
2032            let mut debian_binary_header =
2033                ArHeader::new(b"debian-binary".to_vec(), debian_binary.len() as u64);
2034            debian_binary_header.set_mode(0o100644);
2035            builder
2036                .append(&debian_binary_header, Cursor::new(debian_binary))
2037                .expect("debian-binary entry should be appended");
2038
2039            let mut control_header =
2040                ArHeader::new(b"control.tar.xz".to_vec(), control_tar.len() as u64);
2041            control_header.set_mode(0o100644);
2042            builder
2043                .append(&control_header, Cursor::new(control_tar))
2044                .expect("control.tar.xz entry should be appended");
2045        }
2046
2047        deb
2048    }
2049
2050    fn create_synthetic_deb_with_copyright() -> NamedTempFile {
2051        let mut control_tar = Vec::new();
2052        {
2053            let encoder = GzEncoder::new(&mut control_tar, Compression::default());
2054            let mut tar_builder = TarBuilder::new(encoder);
2055
2056            let control_content = b"Package: synthetic\nVersion: 9.9.9\nArchitecture: all\nDescription: Synthetic deb with copyright\n";
2057            let mut header = TarHeader::new_gnu();
2058            header
2059                .set_path("control")
2060                .expect("control tar path should be valid");
2061            header.set_size(control_content.len() as u64);
2062            header.set_mode(0o644);
2063            header.set_cksum();
2064            tar_builder
2065                .append(&header, Cursor::new(control_content))
2066                .expect("control file should be appended to tar.gz");
2067            tar_builder.finish().expect("control tar.gz should finish");
2068        }
2069
2070        let mut data_tar = Vec::new();
2071        {
2072            let encoder = GzEncoder::new(&mut data_tar, Compression::default());
2073            let mut tar_builder = TarBuilder::new(encoder);
2074
2075            let copyright = b"Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\nFiles: *\nCopyright: 2024 Example Org\nLicense: Apache-2.0\n Licensed under the Apache License, Version 2.0.\n";
2076            let mut header = TarHeader::new_gnu();
2077            header
2078                .set_path("./usr/share/doc/synthetic/copyright")
2079                .expect("copyright path should be valid");
2080            header.set_size(copyright.len() as u64);
2081            header.set_mode(0o644);
2082            header.set_cksum();
2083            tar_builder
2084                .append(&header, Cursor::new(copyright))
2085                .expect("copyright file should be appended to data tar");
2086            tar_builder.finish().expect("data tar.gz should finish");
2087        }
2088
2089        let deb = NamedTempFile::new().expect("temp deb file should be created");
2090        {
2091            let mut builder = ArBuilder::new(
2092                deb.reopen()
2093                    .expect("temporary deb file should reopen for writing"),
2094            );
2095
2096            let debian_binary = b"2.0\n";
2097            let mut debian_binary_header =
2098                ArHeader::new(b"debian-binary".to_vec(), debian_binary.len() as u64);
2099            debian_binary_header.set_mode(0o100644);
2100            builder
2101                .append(&debian_binary_header, Cursor::new(debian_binary))
2102                .expect("debian-binary entry should be appended");
2103
2104            let mut control_header =
2105                ArHeader::new(b"control.tar.gz".to_vec(), control_tar.len() as u64);
2106            control_header.set_mode(0o100644);
2107            builder
2108                .append(&control_header, Cursor::new(control_tar))
2109                .expect("control.tar.gz entry should be appended");
2110
2111            let mut data_header = ArHeader::new(b"data.tar.gz".to_vec(), data_tar.len() as u64);
2112            data_header.set_mode(0o100644);
2113            builder
2114                .append(&data_header, Cursor::new(data_tar))
2115                .expect("data.tar.gz entry should be appended");
2116        }
2117
2118        deb
2119    }
2120
2121    // ====== Namespace detection ======
2122
2123    #[test]
2124    fn test_detect_namespace_from_ubuntu_version() {
2125        assert_eq!(
2126            detect_namespace(Some("1.0-1ubuntu1"), None),
2127            Some("ubuntu".to_string())
2128        );
2129    }
2130
2131    #[test]
2132    fn test_detect_namespace_from_debian_version() {
2133        assert_eq!(
2134            detect_namespace(Some("1.0-1+deb11u1"), None),
2135            Some("debian".to_string())
2136        );
2137    }
2138
2139    #[test]
2140    fn test_detect_namespace_from_ubuntu_maintainer() {
2141        assert_eq!(
2142            detect_namespace(
2143                None,
2144                Some("Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>")
2145            ),
2146            Some("ubuntu".to_string())
2147        );
2148    }
2149
2150    #[test]
2151    fn test_detect_namespace_from_debian_maintainer() {
2152        assert_eq!(
2153            detect_namespace(None, Some("John Doe <john@debian.org>")),
2154            Some("debian".to_string())
2155        );
2156    }
2157
2158    #[test]
2159    fn test_detect_namespace_default() {
2160        assert_eq!(
2161            detect_namespace(None, Some("Unknown <unknown@example.com>")),
2162            Some("debian".to_string())
2163        );
2164    }
2165
2166    #[test]
2167    fn test_detect_namespace_version_takes_priority() {
2168        // Version clue should be checked before maintainer
2169        assert_eq!(
2170            detect_namespace(Some("1.0ubuntu1"), Some("maintainer@debian.org")),
2171            Some("ubuntu".to_string())
2172        );
2173    }
2174
2175    // ====== PURL generation ======
2176
2177    #[test]
2178    fn test_build_purl_basic() {
2179        let purl = build_debian_purl("curl", Some("7.68.0-1"), Some("debian"), Some("amd64"));
2180        assert_eq!(
2181            purl,
2182            Some("pkg:deb/debian/curl@7.68.0-1?arch=amd64".to_string())
2183        );
2184    }
2185
2186    #[test]
2187    fn test_build_purl_no_version() {
2188        let purl = build_debian_purl("curl", None, Some("debian"), Some("any"));
2189        assert_eq!(purl, Some("pkg:deb/debian/curl?arch=any".to_string()));
2190    }
2191
2192    #[test]
2193    fn test_build_purl_no_arch() {
2194        let purl = build_debian_purl("curl", Some("7.68.0"), Some("ubuntu"), None);
2195        assert_eq!(purl, Some("pkg:deb/ubuntu/curl@7.68.0".to_string()));
2196    }
2197
2198    #[test]
2199    fn test_build_purl_no_namespace() {
2200        let purl = build_debian_purl("curl", Some("7.68.0"), None, None);
2201        assert_eq!(purl, Some("pkg:deb/curl@7.68.0".to_string()));
2202    }
2203
2204    // ====== Dependency parsing ======
2205
2206    #[test]
2207    fn test_parse_simple_dependency() {
2208        let deps = parse_dependency_field("libc6", "depends", true, false, Some("debian"));
2209        assert_eq!(deps.len(), 1);
2210        assert_eq!(deps[0].purl, Some("pkg:deb/debian/libc6".to_string()));
2211        assert_eq!(deps[0].extracted_requirement, None);
2212        assert_eq!(deps[0].scope, Some("depends".to_string()));
2213    }
2214
2215    #[test]
2216    fn test_parse_dependency_with_version() {
2217        let deps =
2218            parse_dependency_field("libc6 (>= 2.17)", "depends", true, false, Some("debian"));
2219        assert_eq!(deps.len(), 1);
2220        assert_eq!(deps[0].purl, Some("pkg:deb/debian/libc6".to_string()));
2221        assert_eq!(deps[0].extracted_requirement, Some(">= 2.17".to_string()));
2222    }
2223
2224    #[test]
2225    fn test_parse_dependency_exact_version() {
2226        let deps = parse_dependency_field(
2227            "libc6 (= 2.31-13+deb11u5)",
2228            "depends",
2229            true,
2230            false,
2231            Some("debian"),
2232        );
2233        assert_eq!(deps.len(), 1);
2234        assert_eq!(deps[0].is_pinned, Some(true));
2235    }
2236
2237    #[test]
2238    fn test_parse_dependency_strict_less() {
2239        let deps =
2240            parse_dependency_field("libgcc-s1 (<< 12)", "breaks", false, false, Some("debian"));
2241        assert_eq!(deps.len(), 1);
2242        assert_eq!(deps[0].extracted_requirement, Some("<< 12".to_string()));
2243        assert_eq!(deps[0].scope, Some("breaks".to_string()));
2244    }
2245
2246    #[test]
2247    fn test_parse_multiple_dependencies() {
2248        let deps = parse_dependency_field(
2249            "libc6 (>= 2.17), libssl1.1 (>= 1.1.0), zlib1g (>= 1:1.2.0)",
2250            "depends",
2251            true,
2252            false,
2253            Some("debian"),
2254        );
2255        assert_eq!(deps.len(), 3);
2256    }
2257
2258    #[test]
2259    fn test_parse_dependency_alternatives() {
2260        let deps = parse_dependency_field(
2261            "libssl1.1 | libssl3",
2262            "depends",
2263            true,
2264            false,
2265            Some("debian"),
2266        );
2267        assert_eq!(deps.len(), 2);
2268        // Alternatives are marked as optional
2269        assert_eq!(deps[0].is_optional, Some(true));
2270        assert_eq!(deps[1].is_optional, Some(true));
2271    }
2272
2273    #[test]
2274    fn test_parse_dependency_skips_substitutions() {
2275        let deps = parse_dependency_field(
2276            "${shlibs:Depends}, ${misc:Depends}, libc6",
2277            "depends",
2278            true,
2279            false,
2280            Some("debian"),
2281        );
2282        assert_eq!(deps.len(), 1);
2283        assert_eq!(deps[0].purl, Some("pkg:deb/debian/libc6".to_string()));
2284    }
2285
2286    #[test]
2287    fn test_parse_dependency_with_arch_qualifier() {
2288        // Dependencies can have [arch] qualifiers which we ignore
2289        let deps = parse_dependency_field(
2290            "libc6 (>= 2.17) [amd64]",
2291            "depends",
2292            true,
2293            false,
2294            Some("debian"),
2295        );
2296        assert_eq!(deps.len(), 1);
2297        assert_eq!(deps[0].purl, Some("pkg:deb/debian/libc6".to_string()));
2298    }
2299
2300    #[test]
2301    fn test_parse_empty_dependency() {
2302        let deps = parse_dependency_field("", "depends", true, false, Some("debian"));
2303        assert!(deps.is_empty());
2304    }
2305
2306    // ====== Source field parsing ======
2307
2308    #[test]
2309    fn test_parse_source_field_name_only() {
2310        let sources = parse_source_field(Some("util-linux"), Some("debian"));
2311        assert_eq!(sources.len(), 1);
2312        assert_eq!(sources[0], "pkg:deb/debian/util-linux");
2313    }
2314
2315    #[test]
2316    fn test_parse_source_field_with_version() {
2317        let sources = parse_source_field(Some("util-linux (2.36.1-8+deb11u1)"), Some("debian"));
2318        assert_eq!(sources.len(), 1);
2319        assert_eq!(sources[0], "pkg:deb/debian/util-linux@2.36.1-8%2Bdeb11u1");
2320    }
2321
2322    #[test]
2323    fn test_parse_source_field_empty() {
2324        let sources = parse_source_field(None, Some("debian"));
2325        assert!(sources.is_empty());
2326    }
2327
2328    // ====== Control file parsing ======
2329
2330    #[test]
2331    fn test_parse_debian_control_source_and_binary() {
2332        let content = "\
2333Source: curl
2334Section: web
2335Priority: optional
2336Maintainer: Alessandro Ghedini <ghedo@debian.org>
2337Homepage: https://curl.se/
2338Vcs-Browser: https://salsa.debian.org/debian/curl
2339Vcs-Git: https://salsa.debian.org/debian/curl.git
2340Build-Depends: debhelper (>= 12), libssl-dev
2341
2342Package: curl
2343Architecture: amd64
2344Depends: libc6 (>= 2.17), libcurl4 (= ${binary:Version})
2345Description: command line tool for transferring data with URL syntax";
2346
2347        let packages = parse_debian_control(content);
2348        assert_eq!(packages.len(), 1);
2349
2350        let pkg = &packages[0];
2351        assert_eq!(pkg.name, Some("curl".to_string()));
2352        assert_eq!(pkg.package_type, Some(PackageType::Deb));
2353        assert_eq!(pkg.homepage_url, Some("https://curl.se/".to_string()));
2354        assert_eq!(
2355            pkg.vcs_url,
2356            Some("https://salsa.debian.org/debian/curl.git".to_string())
2357        );
2358        assert_eq!(
2359            pkg.code_view_url,
2360            Some("https://salsa.debian.org/debian/curl".to_string())
2361        );
2362
2363        // Maintainer from source paragraph
2364        assert_eq!(pkg.parties.len(), 1);
2365        assert_eq!(pkg.parties[0].role, Some("maintainer".to_string()));
2366        assert_eq!(pkg.parties[0].name, Some("Alessandro Ghedini".to_string()));
2367        assert_eq!(pkg.parties[0].email, Some("ghedo@debian.org".to_string()));
2368
2369        // Dependencies parsed
2370        assert!(!pkg.dependencies.is_empty());
2371    }
2372
2373    #[test]
2374    fn test_parse_debian_control_multiple_binary() {
2375        let content = "\
2376Source: gzip
2377Maintainer: Debian Developer <dev@debian.org>
2378
2379Package: gzip
2380Architecture: any
2381Depends: libc6 (>= 2.17)
2382Description: GNU file compression
2383
2384Package: gzip-win32
2385Architecture: all
2386Description: gzip for Windows";
2387
2388        let packages = parse_debian_control(content);
2389        assert_eq!(packages.len(), 2);
2390        assert_eq!(packages[0].name, Some("gzip".to_string()));
2391        assert_eq!(packages[1].name, Some("gzip-win32".to_string()));
2392
2393        // Both inherit source maintainer
2394        assert_eq!(packages[0].parties.len(), 1);
2395        assert_eq!(packages[1].parties.len(), 1);
2396    }
2397
2398    #[test]
2399    fn test_parse_debian_control_source_only() {
2400        let content = "\
2401Source: my-package
2402Maintainer: Test User <test@debian.org>
2403Build-Depends: debhelper (>= 13)";
2404
2405        let packages = parse_debian_control(content);
2406        assert_eq!(packages.len(), 1);
2407        assert_eq!(packages[0].name, Some("my-package".to_string()));
2408        // Build-Depends parsed
2409        assert!(!packages[0].dependencies.is_empty());
2410        assert_eq!(
2411            packages[0].dependencies[0].scope,
2412            Some("build-depends".to_string())
2413        );
2414    }
2415
2416    #[test]
2417    fn test_parse_debian_control_with_uploaders() {
2418        let content = "\
2419Source: example
2420Maintainer: Main Dev <main@debian.org>
2421Uploaders: Alice <alice@example.com>, Bob <bob@example.com>
2422
2423Package: example
2424Architecture: any
2425Description: test package";
2426
2427        let packages = parse_debian_control(content);
2428        assert_eq!(packages.len(), 1);
2429        // 1 maintainer + 2 uploaders
2430        assert_eq!(packages[0].parties.len(), 3);
2431        assert_eq!(packages[0].parties[0].role, Some("maintainer".to_string()));
2432        assert_eq!(packages[0].parties[1].role, Some("uploader".to_string()));
2433        assert_eq!(packages[0].parties[2].role, Some("uploader".to_string()));
2434    }
2435
2436    #[test]
2437    fn test_parse_debian_control_vcs_git_with_branch() {
2438        let content = "\
2439Source: example
2440Maintainer: Dev <dev@debian.org>
2441Vcs-Git: https://salsa.debian.org/example.git -b main
2442
2443Package: example
2444Architecture: any
2445Description: test";
2446
2447        let packages = parse_debian_control(content);
2448        assert_eq!(packages.len(), 1);
2449        // Should only take the URL, not the branch
2450        assert_eq!(
2451            packages[0].vcs_url,
2452            Some("https://salsa.debian.org/example.git".to_string())
2453        );
2454    }
2455
2456    #[test]
2457    fn test_parse_debian_control_multi_arch() {
2458        let content = "\
2459Source: example
2460Maintainer: Dev <dev@debian.org>
2461
2462Package: libexample
2463Architecture: any
2464Multi-Arch: same
2465Description: shared library";
2466
2467        let packages = parse_debian_control(content);
2468        assert_eq!(packages.len(), 1);
2469        let extra = packages[0].extra_data.as_ref().unwrap();
2470        assert_eq!(
2471            extra.get("multi_arch"),
2472            Some(&serde_json::Value::String("same".to_string()))
2473        );
2474    }
2475
2476    // ====== dpkg/status parsing ======
2477
2478    #[test]
2479    fn test_parse_dpkg_status_basic() {
2480        let content = "\
2481Package: base-files
2482Status: install ok installed
2483Priority: required
2484Section: admin
2485Installed-Size: 391
2486Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
2487Architecture: amd64
2488Version: 11ubuntu5.6
2489Description: Debian base system miscellaneous files
2490Homepage: https://tracker.debian.org/pkg/base-files
2491
2492Package: not-installed
2493Status: deinstall ok config-files
2494Architecture: amd64
2495Version: 1.0
2496Description: This should be skipped";
2497
2498        let packages = parse_dpkg_status(content);
2499        assert_eq!(packages.len(), 1);
2500
2501        let pkg = &packages[0];
2502        assert_eq!(pkg.name, Some("base-files".to_string()));
2503        assert_eq!(pkg.version, Some("11ubuntu5.6".to_string()));
2504        assert_eq!(pkg.namespace, Some("ubuntu".to_string()));
2505        assert_eq!(
2506            pkg.datasource_id,
2507            Some(DatasourceId::DebianInstalledStatusDb)
2508        );
2509
2510        // Installed-Size in extra_data
2511        let extra = pkg.extra_data.as_ref().unwrap();
2512        assert_eq!(
2513            extra.get("installed_size"),
2514            Some(&serde_json::Value::Number(serde_json::Number::from(391)))
2515        );
2516    }
2517
2518    #[test]
2519    fn test_parse_dpkg_status_multiple_installed() {
2520        let content = "\
2521Package: libc6
2522Status: install ok installed
2523Architecture: amd64
2524Version: 2.31-13+deb11u5
2525Maintainer: GNU Libc Maintainers <debian-glibc@lists.debian.org>
2526Description: GNU C Library
2527
2528Package: zlib1g
2529Status: install ok installed
2530Architecture: amd64
2531Version: 1:1.2.11.dfsg-2+deb11u2
2532Maintainer: Mark Brown <broonie@debian.org>
2533Description: compression library";
2534
2535        let packages = parse_dpkg_status(content);
2536        assert_eq!(packages.len(), 2);
2537        assert_eq!(packages[0].name, Some("libc6".to_string()));
2538        assert_eq!(packages[1].name, Some("zlib1g".to_string()));
2539    }
2540
2541    #[test]
2542    fn test_parse_dpkg_status_with_dependencies() {
2543        let content = "\
2544Package: curl
2545Status: install ok installed
2546Architecture: amd64
2547Version: 7.74.0-1.3+deb11u7
2548Maintainer: Alessandro Ghedini <ghedo@debian.org>
2549Depends: libc6 (>= 2.17), libcurl4 (= 7.74.0-1.3+deb11u7)
2550Recommends: ca-certificates
2551Description: command line tool for transferring data with URL syntax";
2552
2553        let packages = parse_dpkg_status(content);
2554        assert_eq!(packages.len(), 1);
2555
2556        let deps = &packages[0].dependencies;
2557        // 2 from Depends + 1 from Recommends
2558        assert_eq!(deps.len(), 3);
2559
2560        // Check first dependency
2561        assert_eq!(deps[0].purl, Some("pkg:deb/debian/libc6".to_string()));
2562        assert_eq!(deps[0].scope, Some("depends".to_string()));
2563        assert_eq!(deps[0].extracted_requirement, Some(">= 2.17".to_string()));
2564
2565        // Check recommends
2566        assert_eq!(
2567            deps[2].purl,
2568            Some("pkg:deb/debian/ca-certificates".to_string())
2569        );
2570        assert_eq!(deps[2].scope, Some("recommends".to_string()));
2571        assert_eq!(deps[2].is_optional, Some(true));
2572    }
2573
2574    #[test]
2575    fn test_parse_dpkg_status_with_source() {
2576        let content = "\
2577Package: libncurses6
2578Status: install ok installed
2579Architecture: amd64
2580Source: ncurses (6.2+20201114-2+deb11u1)
2581Version: 6.2+20201114-2+deb11u1
2582Maintainer: Craig Small <csmall@debian.org>
2583Description: shared libraries for terminal handling";
2584
2585        let packages = parse_dpkg_status(content);
2586        assert_eq!(packages.len(), 1);
2587        assert!(!packages[0].source_packages.is_empty());
2588        // Source PURL should include version from parentheses
2589        assert!(packages[0].source_packages[0].contains("ncurses"));
2590    }
2591
2592    #[test]
2593    fn test_parse_dpkg_status_filters_not_installed() {
2594        let content = "\
2595Package: installed-pkg
2596Status: install ok installed
2597Version: 1.0
2598Architecture: amd64
2599Description: installed
2600
2601Package: half-installed
2602Status: install ok half-installed
2603Version: 2.0
2604Architecture: amd64
2605Description: half installed
2606
2607Package: deinstall-pkg
2608Status: deinstall ok config-files
2609Version: 3.0
2610Architecture: amd64
2611Description: deinstalled
2612
2613Package: purge-pkg
2614Status: purge ok not-installed
2615Version: 4.0
2616Architecture: amd64
2617Description: purged";
2618
2619        let packages = parse_dpkg_status(content);
2620        assert_eq!(packages.len(), 1);
2621        assert_eq!(packages[0].name, Some("installed-pkg".to_string()));
2622    }
2623
2624    #[test]
2625    fn test_parse_dpkg_status_empty() {
2626        let packages = parse_dpkg_status("");
2627        assert!(packages.is_empty());
2628    }
2629
2630    // ====== is_match tests ======
2631
2632    #[test]
2633    fn test_debian_control_is_match() {
2634        assert!(DebianControlParser::is_match(Path::new(
2635            "/path/to/debian/control"
2636        )));
2637        assert!(DebianControlParser::is_match(Path::new("debian/control")));
2638        assert!(!DebianControlParser::is_match(Path::new(
2639            "/path/to/control"
2640        )));
2641        assert!(!DebianControlParser::is_match(Path::new(
2642            "/path/to/debian/changelog"
2643        )));
2644    }
2645
/// `DebianInstalledParser::is_match` accepts `var/lib/dpkg/status` (absolute or
/// under another root) and rejects `status.d` entries and the `available` file.
#[test]
fn test_debian_installed_is_match() {
    let accepted = ["/var/lib/dpkg/status", "some/root/var/lib/dpkg/status"];
    let rejected = ["/var/lib/dpkg/status.d/something", "/var/lib/dpkg/available"];

    for candidate in accepted {
        assert!(
            DebianInstalledParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianInstalledParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2661
2662    // ====== Edge cases ======
2663
/// Empty control-file content yields no packages.
#[test]
fn test_parse_debian_control_empty_input() {
    assert!(parse_debian_control("").is_empty());
}
2669
/// Free-form text that is not a control file must not panic and must yield no packages.
#[test]
fn test_parse_debian_control_malformed_input() {
    let malformed = "this is not a valid control file\nwith random text";

    let packages = parse_debian_control(malformed);

    // Reaching this point already proves the parser did not panic.
    assert!(packages.is_empty());
}
2677
/// A version constraint carrying an epoch (the `1:` prefix) is preserved verbatim
/// in the extracted requirement.
#[test]
fn test_dependency_with_epoch_version() {
    let field = "zlib1g (>= 1:1.2.11)";

    let deps = parse_dependency_field(field, "depends", true, false, Some("debian"));

    assert_eq!(deps.len(), 1);
    assert_eq!(deps[0].extracted_requirement.as_deref(), Some(">= 1:1.2.11"));
}
2694
/// A `+` in a dependency name appears percent-encoded (`%2B`) in the generated purl.
#[test]
fn test_dependency_with_plus_in_name() {
    let field = "libstdc++6 (>= 10)";

    let deps = parse_dependency_field(field, "depends", true, false, Some("debian"));

    assert_eq!(deps.len(), 1);
    let purl = deps[0].purl.as_deref().unwrap();
    assert!(purl.contains("libstdc%2B%2B6"));
}
2702
/// `DebianDscParser::is_match` accepts `.dsc` file names and rejects other names.
#[test]
fn test_dsc_parser_is_match() {
    let accepted = ["package.dsc", "adduser_3.118+deb11u1.dsc"];
    let rejected = ["control", "package.txt"];

    for candidate in accepted {
        assert!(
            DebianDscParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianDscParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2712
/// Parses the `adduser` `.dsc` fixture and checks identity fields, URLs, parties,
/// the source package purl, and the presence of the expected build dependencies.
#[test]
fn test_dsc_parser_adduser() {
    let dsc_path = PathBuf::from("testdata/debian/dsc_files/adduser_3.118+deb11u1.dsc");
    let package = DebianDscParser::extract_first_package(&dsc_path);

    // Identity and URL fields.
    assert_eq!(package.package_type, Some(PACKAGE_TYPE));
    assert_eq!(package.namespace.as_deref(), Some("debian"));
    assert_eq!(package.name.as_deref(), Some("adduser"));
    assert_eq!(package.version.as_deref(), Some("3.118+deb11u1"));
    assert_eq!(
        package.purl.as_deref(),
        Some("pkg:deb/debian/adduser@3.118%2Bdeb11u1?arch=all")
    );
    assert_eq!(
        package.vcs_url.as_deref(),
        Some("https://salsa.debian.org/debian/adduser.git")
    );
    assert_eq!(
        package.code_view_url.as_deref(),
        Some("https://salsa.debian.org/debian/adduser")
    );
    assert_eq!(
        package.datasource_id,
        Some(DatasourceId::DebianSourceControlDsc)
    );

    // Maintainer first, then the single uploader.
    assert_eq!(package.parties.len(), 2);

    let maintainer = &package.parties[0];
    assert_eq!(maintainer.role.as_deref(), Some("maintainer"));
    assert_eq!(maintainer.name.as_deref(), Some("Debian Adduser Developers"));
    assert_eq!(
        maintainer.email.as_deref(),
        Some("adduser@packages.debian.org")
    );
    assert!(maintainer.r#type.is_none());

    let uploader = &package.parties[1];
    assert_eq!(uploader.role.as_deref(), Some("uploader"));
    assert_eq!(uploader.name.as_deref(), Some("Marc Haber"));
    assert_eq!(
        uploader.email.as_deref(),
        Some("mh+debian-packages@zugschlus.de")
    );
    assert!(uploader.r#type.is_none());

    // The source package purl carries no version.
    assert_eq!(package.source_packages.len(), 1);
    assert_eq!(package.source_packages[0], "pkg:deb/debian/adduser");

    // At least two dependency purls mention po-debconf or debhelper.
    assert!(!package.dependencies.is_empty());
    let matching_build_deps = package
        .dependencies
        .iter()
        .filter_map(|dep| dep.purl.as_deref())
        .filter(|purl| purl.contains("po-debconf") || purl.contains("debhelper"))
        .count();
    assert!(matching_build_deps >= 2);
}
2775
/// Parses the `zsh` `.dsc` fixture and checks name, version, namespace and purl contents.
#[test]
fn test_dsc_parser_zsh() {
    let dsc_path = PathBuf::from("testdata/debian/dsc_files/zsh_5.7.1-1+deb10u1.dsc");
    let package = DebianDscParser::extract_first_package(&dsc_path);

    assert_eq!(package.name.as_deref(), Some("zsh"));
    assert_eq!(package.version.as_deref(), Some("5.7.1-1+deb10u1"));
    assert_eq!(package.namespace.as_deref(), Some("debian"));

    assert!(package.purl.is_some());
    let purl = package.purl.as_deref().unwrap();
    assert!(purl.contains("zsh"));
    assert!(purl.contains("5.7.1"));
}
2788
/// A minimal in-memory `.dsc` document yields name, version, namespace, one
/// maintainer party, one dependency, and an `arch` qualifier in the purl.
#[test]
fn test_parse_dsc_content_basic() {
    // The entry under `Files:` is a continuation line and keeps its leading space.
    let content = concat!(
        "Format: 3.0 (native)\n",
        "Source: testpkg\n",
        "Binary: testpkg\n",
        "Architecture: amd64\n",
        "Version: 1.0.0\n",
        "Maintainer: Test User <test@example.com>\n",
        "Standards-Version: 4.5.0\n",
        "Build-Depends: debhelper (>= 12)\n",
        "Files:\n",
        " abc123 1024 testpkg_1.0.0.tar.xz\n",
    );

    let package = parse_dsc_content(content);

    assert_eq!(package.name.as_deref(), Some("testpkg"));
    assert_eq!(package.version.as_deref(), Some("1.0.0"));
    assert_eq!(package.namespace.as_deref(), Some("debian"));

    assert_eq!(package.parties.len(), 1);
    let maintainer = &package.parties[0];
    assert_eq!(maintainer.name.as_deref(), Some("Test User"));
    assert_eq!(maintainer.email.as_deref(), Some("test@example.com"));

    assert_eq!(package.dependencies.len(), 1);
    assert!(package.purl.as_deref().unwrap().contains("arch=amd64"));
}
2816
/// A maintainer plus two comma-separated uploaders produce three parties, in
/// that order, with the matching roles.
#[test]
fn test_parse_dsc_content_with_uploaders() {
    let content = concat!(
        "Source: mypkg\n",
        "Version: 2.0\n",
        "Architecture: all\n",
        "Maintainer: Main Dev <main@example.com>\n",
        "Uploaders: Dev One <dev1@example.com>, Dev Two <dev2@example.com>\n",
    );

    let package = parse_dsc_content(content);

    let roles: Vec<Option<&str>> = package
        .parties
        .iter()
        .map(|party| party.role.as_deref())
        .collect();
    assert_eq!(
        roles,
        vec![Some("maintainer"), Some("uploader"), Some("uploader")]
    );
}
2832
/// `DebianOrigTarParser::is_match` accepts `*.orig.tar.*` names and rejects
/// `*.debian.tar.*` tarballs and unrelated files.
#[test]
fn test_orig_tar_parser_is_match() {
    let accepted = ["package_1.0.orig.tar.gz", "abseil_0~20200923.3.orig.tar.xz"];
    let rejected = ["package.debian.tar.gz", "control"];

    for candidate in accepted {
        assert!(
            DebianOrigTarParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianOrigTarParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2846
/// `DebianDebianTarParser::is_match` accepts `*.debian.tar.*` names and rejects
/// `*.orig.tar.*` tarballs and unrelated files.
#[test]
fn test_debian_tar_parser_is_match() {
    let accepted = [
        "package_1.0-1.debian.tar.xz",
        "abseil_20220623.1-1.debian.tar.gz",
    ];
    let rejected = ["package.orig.tar.gz", "control"];

    for candidate in accepted {
        assert!(
            DebianDebianTarParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianDebianTarParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2860
/// An `.orig.tar.gz` file name is split into name and version, and the purl and
/// datasource id are populated accordingly.
#[test]
fn test_parse_orig_tar_filename() {
    let filename = "abseil_0~20200923.3.orig.tar.gz";

    let pkg = parse_source_tarball_filename(filename, DatasourceId::DebianOriginalSourceTarball);

    assert_eq!(pkg.name.as_deref(), Some("abseil"));
    assert_eq!(pkg.version.as_deref(), Some("0~20200923.3"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));
    assert_eq!(
        pkg.purl.as_deref(),
        Some("pkg:deb/debian/abseil@0~20200923.3")
    );
    assert_eq!(
        pkg.datasource_id,
        Some(DatasourceId::DebianOriginalSourceTarball)
    );
}
2879
/// A `.debian.tar.xz` file name is split into name and version (including the
/// Debian revision), and the purl is generated from them.
#[test]
fn test_parse_debian_tar_filename() {
    let filename = "abseil_20220623.1-1.debian.tar.xz";

    let pkg = parse_source_tarball_filename(filename, DatasourceId::DebianSourceMetadataTarball);

    assert_eq!(pkg.name.as_deref(), Some("abseil"));
    assert_eq!(pkg.version.as_deref(), Some("20220623.1-1"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));
    assert_eq!(
        pkg.purl.as_deref(),
        Some("pkg:deb/debian/abseil@20220623.1-1")
    );
}
2894
/// A well-formed `.deb` file name yields name and version; a name without the
/// expected underscores yields neither.
#[test]
fn test_parse_deb_filename() {
    let well_formed = parse_deb_filename("nginx_1.18.0-1_amd64.deb");
    assert_eq!(well_formed.name.as_deref(), Some("nginx"));
    assert_eq!(well_formed.version.as_deref(), Some("1.18.0-1"));

    let malformed = parse_deb_filename("invalid.deb");
    assert!(malformed.name.is_none());
    assert!(malformed.version.is_none());
}
2905
/// The version is extracted identically for `.gz`, `.xz` and `.bz2` tarballs.
#[test]
fn test_parse_source_tarball_various_compressions() {
    let filenames = [
        "test_1.0.orig.tar.gz",
        "test_1.0.orig.tar.xz",
        "test_1.0.orig.tar.bz2",
    ];

    for filename in filenames {
        let pkg =
            parse_source_tarball_filename(filename, DatasourceId::DebianOriginalSourceTarball);
        assert_eq!(
            pkg.version.as_deref(),
            Some("1.0"),
            "unexpected version for {filename}"
        );
    }
}
2925
/// A tarball name without the `name_version` underscore yields neither name nor version.
#[test]
fn test_parse_source_tarball_invalid_format() {
    let filename = "invalid-no-underscore.tar.gz";

    let pkg = parse_source_tarball_filename(filename, DatasourceId::DebianOriginalSourceTarball);

    assert!(pkg.name.is_none());
    assert!(pkg.version.is_none());
}
2935
/// `DebianInstalledListParser::is_match` accepts `.list` files under
/// `/var/lib/dpkg/info/` and rejects a bare `.list` name or an `.md5sums` file.
#[test]
fn test_list_parser_is_match() {
    let accepted = [
        "/var/lib/dpkg/info/bash.list",
        "/var/lib/dpkg/info/package:amd64.list",
    ];
    let rejected = ["bash.list", "/var/lib/dpkg/info/bash.md5sums"];

    for candidate in accepted {
        assert!(
            DebianInstalledListParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianInstalledListParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2951
/// `DebianInstalledMd5sumsParser::is_match` accepts `.md5sums` files under
/// `/var/lib/dpkg/info/` and rejects a bare `.md5sums` name or a `.list` file.
#[test]
fn test_md5sums_parser_is_match() {
    let accepted = [
        "/var/lib/dpkg/info/bash.md5sums",
        "/var/lib/dpkg/info/package:amd64.md5sums",
    ];
    let rejected = ["bash.md5sums", "/var/lib/dpkg/info/bash.list"];

    for candidate in accepted {
        assert!(
            DebianInstalledMd5sumsParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianInstalledMd5sumsParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
2967
/// A plain `.list` file yields one reference per file entry, without checksums;
/// the `/.` and `/bin` entries are not reported.
#[test]
fn test_parse_debian_file_list_plain_list() {
    let content = concat!(
        "/.\n",
        "/bin\n",
        "/bin/bash\n",
        "/usr/bin/bashbug\n",
        "/usr/share/doc/bash/README\n",
    );

    let pkg = parse_debian_file_list(content, "bash", DatasourceId::DebianInstalledFilesList);

    assert_eq!(pkg.name.as_deref(), Some("bash"));

    let paths: Vec<&str> = pkg
        .file_references
        .iter()
        .map(|reference| reference.path.as_str())
        .collect();
    assert_eq!(
        paths,
        vec!["/bin/bash", "/usr/bin/bashbug", "/usr/share/doc/bash/README"]
    );
    assert!(pkg.file_references[0].md5.is_none());
}
2984
/// `md5sums`-style lines (`<hash>  <path>`) yield references carrying both the
/// path and its MD5 checksum.
#[test]
fn test_parse_debian_file_list_md5sums() {
    let content = concat!(
        "77506afebd3b7e19e937a678a185b62e  bin/bash\n",
        "1c77d2031971b4e4c512ac952102cd85  usr/bin/bashbug\n",
        "f55e3a16959b0bb8915cb5f219521c80  usr/share/doc/bash/COMPAT.gz\n",
    );

    let pkg = parse_debian_file_list(content, "bash", DatasourceId::DebianInstalledFilesList);

    assert_eq!(pkg.name.as_deref(), Some("bash"));
    assert_eq!(pkg.file_references.len(), 3);

    // Only the first two entries are checked in detail.
    let expected_prefix = [
        ("bin/bash", "77506afebd3b7e19e937a678a185b62e"),
        ("usr/bin/bashbug", "1c77d2031971b4e4c512ac952102cd85"),
    ];
    for (reference, (path, md5)) in pkg.file_references.iter().zip(expected_prefix) {
        assert_eq!(reference.path, path);
        assert_eq!(reference.md5.as_deref(), Some(md5));
    }
}
3005
/// A `name:arch` package identifier is split: the name loses the suffix and the
/// architecture shows up as a purl qualifier.
#[test]
fn test_parse_debian_file_list_with_arch() {
    let content = concat!(
        "/usr/bin/foo\n",
        "/usr/lib/x86_64-linux-gnu/libfoo.so\n",
    );

    let pkg = parse_debian_file_list(
        content,
        "libfoo:amd64",
        DatasourceId::DebianInstalledFilesList,
    );

    assert_eq!(pkg.name.as_deref(), Some("libfoo"));
    assert!(pkg.purl.is_some());
    let purl = pkg.purl.as_deref().unwrap();
    assert!(purl.contains("arch=amd64"));
    assert_eq!(pkg.file_references.len(), 2);
}
3021
/// Comment lines, empty lines and whitespace-only lines do not produce file references.
#[test]
fn test_parse_debian_file_list_skips_comments_and_empty() {
    let content = concat!(
        "# This is a comment\n",
        "/bin/bash\n",
        "\n",
        "/usr/bin/bashbug\n",
        "  \n", // whitespace-only line
    );

    let pkg = parse_debian_file_list(content, "bash", DatasourceId::DebianInstalledFilesList);

    assert_eq!(pkg.file_references.len(), 2);
}
3033
/// When the package identifier is the literal `md5sums`, no package name is
/// set, but the file reference is still collected.
#[test]
fn test_parse_debian_file_list_md5sums_only() {
    let content = "abc123  usr/bin/tool\n";

    let pkg = parse_debian_file_list(content, "md5sums", DatasourceId::DebianInstalledFilesList);

    assert!(pkg.name.is_none());
    assert_eq!(pkg.file_references.len(), 1);
}
3043
/// Entries such as `/.`, `/bin`, `/etc`, `/usr` and `/var` are dropped; only the
/// actual file remains.
#[test]
fn test_parse_debian_file_list_ignores_root_dirs() {
    let content = concat!(
        "/.\n",
        "/bin\n",
        "/bin/bash\n",
        "/etc\n",
        "/usr\n",
        "/var\n",
    );

    let pkg = parse_debian_file_list(content, "bash", DatasourceId::DebianInstalledFilesList);

    let paths: Vec<&str> = pkg
        .file_references
        .iter()
        .map(|reference| reference.path.as_str())
        .collect();
    assert_eq!(paths, vec!["/bin/bash"]);
}
3057
/// `DebianCopyrightParser::is_match` accepts `copyright` files under
/// `/usr/share/doc/<pkg>/` or `debian/` and rejects other locations and names.
#[test]
fn test_copyright_parser_is_match() {
    let accepted = ["/usr/share/doc/bash/copyright", "debian/copyright"];
    let rejected = ["copyright.txt", "/etc/copyright"];

    for candidate in accepted {
        assert!(
            DebianCopyrightParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianCopyrightParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
3073
/// The package name is taken from `/usr/share/doc/<pkg>/copyright` paths; a
/// `debian/copyright` path yields no name.
#[test]
fn test_extract_package_name_from_path() {
    let cases = [
        ("/usr/share/doc/bash/copyright", Some("bash")),
        ("/usr/share/doc/libseccomp2/copyright", Some("libseccomp2")),
        ("debian/copyright", None),
    ];

    for (raw_path, expected) in cases {
        let extracted = extract_package_name_from_path(&PathBuf::from(raw_path));
        assert_eq!(
            extracted.as_deref(),
            expected,
            "unexpected package name for {raw_path}"
        );
    }
}
3089
/// A machine-readable (DEP-5) copyright file yields the license statement and
/// copyright-holder parties, with the package name supplied by the caller.
#[test]
fn test_parse_copyright_dep5_format() {
    // Continuation lines keep their single leading space.
    let content = concat!(
        "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
        "Upstream-Name: libseccomp\n",
        "Source: https://sourceforge.net/projects/libseccomp/\n",
        "\n",
        "Files: *\n",
        "Copyright: 2012 Paul Moore <pmoore@redhat.com>\n",
        " 2012 Ashley Lai <adlai@us.ibm.com>\n",
        "License: LGPL-2.1\n",
        "\n",
        "License: LGPL-2.1\n",
        " This library is free software\n",
    );

    let pkg = parse_copyright_file(content, Some("libseccomp"));

    assert_eq!(pkg.name.as_deref(), Some("libseccomp"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));
    assert_eq!(pkg.datasource_id, Some(DatasourceId::DebianCopyright));
    assert_eq!(pkg.extracted_license_statement.as_deref(), Some("LGPL-2.1"));

    assert!(pkg.parties.len() >= 2);
    let first_holder = &pkg.parties[0];
    assert_eq!(first_holder.role.as_deref(), Some("copyright-holder"));
    assert!(first_holder.name.as_deref().unwrap().contains("Paul Moore"));
}
3116
/// For the `bsdutils` fixture, the primary license detection points at the
/// `License: GPL-2+` header on line 47.
#[test]
fn test_parse_copyright_primary_license_detection_from_bsdutils_fixture() {
    let fixture = PathBuf::from(
        "testdata/debian-fixtures/debian-slim-2021-04-07/usr/share/doc/bsdutils/copyright",
    );
    let pkg = DebianCopyrightParser::extract_first_package(&fixture);

    assert_eq!(pkg.name.as_deref(), Some("bsdutils"));

    let statement = pkg
        .extracted_license_statement
        .as_deref()
        .expect("license statement should exist");
    assert!(statement.contains("GPL-2+"));
    assert!(!pkg.license_detections.is_empty());

    let primary_match = &pkg.license_detections[0].matches[0];
    assert_eq!(
        primary_match.matched_text.as_deref(),
        Some("License: GPL-2+")
    );
    assert_eq!(primary_match.start_line, 47);
    assert_eq!(primary_match.end_line, 47);
}
3140
/// The fixture produces one primary detection and four secondary detections;
/// the secondary ones are in file order and keep the header's original casing.
#[test]
fn test_parse_copyright_emits_ordered_absolute_case_preserved_detections() {
    let fixture = PathBuf::from("testdata/debian/copyright/copyright");
    let pkg = DebianCopyrightParser::extract_first_package(&fixture);

    assert_eq!(pkg.license_detections.len(), 1);
    assert_eq!(pkg.other_license_detections.len(), 4);

    let primary_match = &pkg.license_detections[0].matches[0];
    assert_eq!(
        primary_match.matched_text.as_deref(),
        Some("License: LGPL-2.1")
    );
    assert_eq!(primary_match.start_line, 11);

    // Start lines of the first match of every secondary detection, in order.
    let start_lines: Vec<usize> = pkg
        .other_license_detections
        .iter()
        .map(|detection| detection.matches[0].start_line)
        .collect();
    assert_eq!(start_lines, vec![15, 19, 23, 25]);

    // Every secondary detection carries the same verbatim header text.
    let header_texts: Vec<&str> = pkg
        .other_license_detections
        .iter()
        .map(|detection| detection.matches[0].matched_text.as_deref().unwrap())
        .collect();
    assert_eq!(header_texts, vec!["License: LGPL-2.1"; 4]);
}
3178
/// In the clamav fixture, a `License: Zlib` paragraph is detected, and the last
/// such detection sits on line 732.
#[test]
fn test_parse_copyright_detects_bottom_standalone_license_paragraph() {
    const ZLIB_HEADER: &str = "License: Zlib";

    let fixture = PathBuf::from(
        "testdata/debian-fixtures/debian-2019-11-15/main/c/clamav/stable_copyright",
    );
    let pkg = DebianCopyrightParser::extract_first_package(&fixture);
    let detections = &pkg.other_license_detections;

    // Searching from the top: at least one Zlib paragraph exists.
    let first_zlib = detections
        .iter()
        .find(|detection| detection.matches[0].matched_text.as_deref() == Some(ZLIB_HEADER))
        .expect("at least one Zlib license paragraph should be detected");
    assert_eq!(
        first_zlib.matches[0].matched_text.as_deref(),
        Some(ZLIB_HEADER)
    );

    // Searching from the bottom: the last Zlib paragraph is the standalone one.
    let last_zlib = detections
        .iter()
        .rev()
        .find(|detection| detection.matches[0].matched_text.as_deref() == Some(ZLIB_HEADER))
        .expect("bottom standalone Zlib license paragraph should be detected");
    let last_match = &last_zlib.matches[0];
    assert_eq!(last_match.start_line, 732);
    assert_eq!(last_match.end_line, 732);
}
3205
/// For the `test_license_nameless` fixture, the primary detection is the
/// `License: LGPL-3+ or GPL-2+` header on line 8, and `License: GPL-2+` is
/// among the secondary detections.
#[test]
fn test_parse_copyright_uses_header_paragraph_as_primary_when_files_star_is_blank() {
    let fixture =
        PathBuf::from("testdata/debian-fixtures/crafted_for_tests/test_license_nameless");
    let pkg = DebianCopyrightParser::extract_first_package(&fixture);

    assert_eq!(pkg.license_detections.len(), 1);
    let primary_match = &pkg.license_detections[0].matches[0];
    assert_eq!(
        primary_match.matched_text.as_deref(),
        Some("License: LGPL-3+ or GPL-2+")
    );
    assert_eq!(primary_match.start_line, 8);
    assert_eq!(primary_match.end_line, 8);

    let has_gpl_secondary = pkg
        .other_license_detections
        .iter()
        .any(|detection| detection.matches[0].matched_text.as_deref() == Some("License: GPL-2+"));
    assert!(has_gpl_secondary);
}
3225
/// When both the header paragraph and the `Files: *` paragraph carry a license,
/// the `Files: *` one (line 7 here) becomes the primary detection.
#[test]
fn test_parse_copyright_prefers_files_star_primary_over_header_paragraph() {
    let content = concat!(
        "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
        "Upstream-Name: foo\n",
        "License: MIT\n",
        "\n",
        "Files: *\n",
        "Copyright: 2024 Example\n",
        "License: GPL-2+\n",
    );

    let pkg = parse_copyright_file(content, Some("foo"));

    assert_eq!(pkg.license_detections.len(), 1);
    let primary_match = &pkg.license_detections[0].matches[0];
    assert_eq!(
        primary_match.matched_text.as_deref(),
        Some("License: GPL-2+")
    );
    assert_eq!(primary_match.start_line, 7);
}
3239
/// `finalize_copyright_paragraph` must agree with the reference RFC 822 parser
/// on headers and body, and must record the `License:` header line together
/// with its line number (start line 10 plus offset 2 gives 12).
#[test]
fn test_finalize_copyright_paragraph_matches_rfc822_headers_and_license_line() {
    let raw_lines: Vec<String> = vec![
        "Files: *",
        "Copyright: 2024 Example Org",
        "License: Apache-2.0",
        " Licensed under the Apache License, Version 2.0.",
    ]
    .into_iter()
    .map(String::from)
    .collect();
    // Build the reference input before the lines are handed over by value.
    let joined = raw_lines.join("\n");

    let paragraph = finalize_copyright_paragraph(raw_lines, 10);
    let reference = rfc822::parse_rfc822_paragraphs(&joined)
        .into_iter()
        .next()
        .expect("reference RFC822 paragraph should parse");

    assert_eq!(paragraph.metadata.headers, reference.headers);
    assert_eq!(paragraph.metadata.body, reference.body);
    assert_eq!(
        paragraph.license_header_line,
        Some(("License: Apache-2.0".to_string(), 12))
    );
}
3262
/// A free-form (non-DEP-5) copyright file still yields a license statement and
/// at least one party.
#[test]
fn test_parse_copyright_unstructured() {
    // Indented values sit on the line after their heading.
    let content = concat!(
        "This package was debianized by John Doe.\n",
        "\n",
        "Upstream Authors:\n",
        "    Jane Smith\n",
        "\n",
        "Copyright:\n",
        "    2009 10gen\n",
        "\n",
        "License:\n",
        "    SSPL\n",
    );

    let pkg = parse_copyright_file(content, Some("mongodb"));

    assert_eq!(pkg.name.as_deref(), Some("mongodb"));
    assert_eq!(pkg.extracted_license_statement.as_deref(), Some("SSPL"));
    assert!(!pkg.parties.is_empty());
}
3281
/// Three copyright lines (two bare, one with a `Copyright (C)` prefix) yield at
/// least three holders, including the named person and the corporation.
#[test]
fn test_parse_copyright_holders() {
    let text = concat!(
        "2012 Paul Moore <pmoore@redhat.com>\n",
        "2012 Ashley Lai <adlai@us.ibm.com>\n",
        "Copyright (C) 2015-2018 Example Corp",
    );

    let holders = parse_copyright_holders(text);

    assert!(holders.len() >= 3);
    for expected in ["Paul Moore", "Example Corp"] {
        assert!(
            holders.iter().any(|holder| holder.contains(expected)),
            "missing holder {expected}"
        );
    }
}
3292
/// Text without copyright or license information keeps the supplied name but
/// yields no parties and no license statement.
#[test]
fn test_parse_copyright_empty() {
    let pkg = parse_copyright_file(
        "This is just some text without proper copyright info.",
        Some("test"),
    );

    assert_eq!(pkg.name.as_deref(), Some("test"));
    assert!(pkg.parties.is_empty());
    assert!(pkg.extracted_license_statement.is_none());
}
3301
/// `DebianDebParser::is_match` accepts `.deb` file names and rejects other names.
#[test]
fn test_deb_parser_is_match() {
    let accepted = [
        "package.deb",
        "libapache2-mod-md_2.4.38-3+deb10u10_amd64.deb",
    ];
    let rejected = ["package.tar.gz", "control"];

    for candidate in accepted {
        assert!(
            DebianDebParser::is_match(Path::new(candidate)),
            "expected a match for {candidate}"
        );
    }
    for candidate in rejected {
        assert!(
            !DebianDebParser::is_match(Path::new(candidate)),
            "expected no match for {candidate}"
        );
    }
}
3311
/// A `name_version_arch.deb` file name yields name, version, the `debian`
/// namespace, a purl with an `arch` qualifier, and the `.deb` datasource id.
#[test]
fn test_parse_deb_filename_with_arch() {
    let filename = "libapache2-mod-md_2.4.38-3+deb10u10_amd64.deb";

    let pkg = parse_deb_filename(filename);

    assert_eq!(pkg.name.as_deref(), Some("libapache2-mod-md"));
    assert_eq!(pkg.version.as_deref(), Some("2.4.38-3+deb10u10"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));
    assert_eq!(
        pkg.purl.as_deref(),
        Some("pkg:deb/debian/libapache2-mod-md@2.4.38-3%2Bdeb10u10?arch=amd64")
    );
    assert_eq!(pkg.datasource_id, Some(DatasourceId::DebianDeb));
}
3324
/// The architecture-independent marker `all` is kept as the purl's `arch` qualifier.
#[test]
fn test_parse_deb_filename_without_arch() {
    let pkg = parse_deb_filename("package_1.0-1_all.deb");

    assert_eq!(pkg.name.as_deref(), Some("package"));
    assert_eq!(pkg.version.as_deref(), Some("1.0-1"));

    let purl = pkg.purl.as_deref().unwrap();
    assert!(purl.contains("arch=all"));
}
3332
/// Extracts metadata from the real `adduser` `.deb` fixture and checks name,
/// version, namespace, description, parties and purl contents.
///
/// The fixture is optional. When it is absent the test is skipped, and the skip
/// is reported on stderr (as `test_distroless_parser` does) so that a missing
/// fixture is not silently mistaken for a passing extraction.
#[test]
fn test_extract_deb_archive() {
    let test_path = PathBuf::from("testdata/debian/deb/adduser_3.112ubuntu1_all.deb");
    if !test_path.exists() {
        eprintln!(
            "Warning: Test file not found, skipping test: {}",
            test_path.display()
        );
        return;
    }

    let pkg = DebianDebParser::extract_first_package(&test_path);

    assert_eq!(pkg.name, Some("adduser".to_string()));
    assert_eq!(pkg.version, Some("3.112ubuntu1".to_string()));
    assert_eq!(pkg.namespace, Some("ubuntu".to_string()));
    assert!(pkg.description.is_some());
    assert!(!pkg.parties.is_empty());

    let purl = pkg.purl.as_ref().unwrap();
    assert!(purl.contains("adduser"));
    assert!(purl.contains("3.112ubuntu1"));
}
3351
/// Metadata is extracted from a synthetic `.deb` built by
/// `create_synthetic_deb_with_control_tar_xz`.
#[test]
fn test_extract_deb_archive_with_control_tar_xz() {
    let deb = create_synthetic_deb_with_control_tar_xz();
    let pkg = DebianDebParser::extract_first_package(deb.path());

    assert_eq!(pkg.name.as_deref(), Some("synthetic"));
    assert_eq!(pkg.version.as_deref(), Some("1.2.3"));
    assert_eq!(pkg.description.as_deref(), Some("Synthetic deb"));
    assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
}
3363
/// For the synthetic `.deb` built by `create_synthetic_deb_with_copyright`, the
/// license statement and a copyright-holder party are present on the package.
#[test]
fn test_extract_deb_archive_collects_embedded_copyright_metadata() {
    let deb = create_synthetic_deb_with_copyright();
    let pkg = DebianDebParser::extract_first_package(deb.path());

    assert_eq!(pkg.name.as_deref(), Some("synthetic"));
    assert_eq!(pkg.extracted_license_statement.as_deref(), Some("Apache-2.0"));

    let has_expected_holder = pkg.parties.iter().any(|party| {
        matches!(
            (party.role.as_deref(), party.name.as_deref()),
            (Some("copyright-holder"), Some("Example Org"))
        )
    });
    assert!(has_expected_holder);
}
3380
/// Parsing from the file name alone reports the `debian` namespace, even for a
/// version string that contains `ubuntu`.
#[test]
fn test_parse_deb_filename_simple() {
    let pkg = parse_deb_filename("adduser_3.112ubuntu1_all.deb");

    assert_eq!(pkg.name.as_deref(), Some("adduser"));
    assert_eq!(pkg.version.as_deref(), Some("3.112ubuntu1"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));
}
3388
/// A `.deb` name without `name_version` structure yields neither name nor version.
#[test]
fn test_parse_deb_filename_invalid() {
    let malformed = parse_deb_filename("invalid.deb");

    assert!(malformed.name.is_none());
    assert!(malformed.version.is_none());
}
3395
/// A `status.d` entry is matched by the distroless parser and, when the fixture
/// exists, parsed into the expected `base-files` package.
#[test]
fn test_distroless_parser() {
    let fixture = PathBuf::from("testdata/debian/var/lib/dpkg/status.d/base-files");

    // Path matching does not require the file to exist.
    assert!(DebianDistrolessInstalledParser::is_match(&fixture));

    if !fixture.exists() {
        eprintln!("Warning: Test file not found, skipping test");
        return;
    }

    let pkg = DebianDistrolessInstalledParser::extract_first_package(&fixture);

    assert_eq!(pkg.package_type, Some(PackageType::Deb));
    assert_eq!(
        pkg.datasource_id,
        Some(DatasourceId::DebianDistrolessInstalledDb)
    );
    assert_eq!(pkg.name.as_deref(), Some("base-files"));
    assert_eq!(pkg.version.as_deref(), Some("11.1+deb11u8"));
    assert_eq!(pkg.namespace.as_deref(), Some("debian"));

    assert!(pkg.purl.is_some());
    let purl = pkg.purl.as_deref().unwrap();
    assert!(purl.contains("pkg:deb/debian/base-files"));
}
3425}