nabla_cli/binary/
metadata_extractor.rs

1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use std::collections::HashSet;
4
5use goblin::{elf::Elf, pe::PE};
6use wasmparser::{Parser, Payload};
7
8#[derive(Debug, Serialize, Deserialize, Clone)]
9pub struct VersionInfo {
10    pub version_strings: Vec<String>,
11    pub file_version: Option<String>,
12    pub product_version: Option<String>,
13    pub company: Option<String>,
14    pub product_name: Option<String>,
15    pub confidence: f64,
16}
17
18#[derive(Debug, Serialize, Deserialize, Clone)]
19pub struct LicenseInfo {
20    pub licenses: Vec<String>,
21    pub copyright_notices: Vec<String>,
22    pub spdx_identifiers: Vec<String>,
23    pub license_texts: Vec<String>,
24    pub confidence: f64,
25}
26
27pub fn extract_version_info(contents: &[u8], strings: &[String], format: &str) -> VersionInfo {
28    let mut version_strings = HashSet::new();
29    let mut file_version = None;
30    let mut product_version = None;
31    let mut company = None;
32    let mut product_name = None;
33
34    let version_patterns = [
35        Regex::new(r"\b(\d+\.\d+\.\d+(?:\.\d+)?)\b").unwrap(),
36        Regex::new(r"\bv(\d+\.\d+\.\d+(?:\.\d+)?)\b").unwrap(),
37        Regex::new(r"\bversion\s*[:=]\s*([^\s,;]+)").unwrap(),
38        Regex::new(r"\bVERSION\s*[:=]\s*([^\s,;]+)").unwrap(),
39        Regex::new(r"\b(\d+\.\d+(?:\.\d+)?(?:\.\d+)?)\b").unwrap(),
40    ];
41
42    for string in strings {
43        for pattern in &version_patterns {
44            for captures in pattern.captures_iter(string) {
45                if let Some(version) = captures.get(1) {
46                    if is_valid_version(version.as_str()) {
47                        version_strings.insert(version.as_str().to_string());
48                    }
49                }
50            }
51        }
52
53        if company.is_none() {
54            if let Some(comp) = extract_company_name(string) {
55                company = Some(comp);
56            }
57        }
58
59        if product_name.is_none() {
60            if let Some(prod) = extract_product_name(string) {
61                product_name = Some(prod);
62            }
63        }
64    }
65
66    match format {
67        "application/x-msdownload" => {
68            if let Some(pe_version) = extract_pe_version_info(contents) {
69                file_version = file_version.or(pe_version.file_version);
70                product_version = product_version.or(pe_version.product_version);
71                company = company.or(pe_version.company);
72                product_name = product_name.or(pe_version.product_name);
73            }
74        }
75        "application/x-elf" => {
76            if let Some(elf_versions) = extract_elf_version_info(contents) {
77                version_strings.extend(elf_versions);
78            }
79        }
80        "application/x-mach-binary" => {
81            if let Some(macho_versions) = extract_macho_version_info(contents) {
82                version_strings.extend(macho_versions);
83            }
84        }
85        "application/wasm" => {
86            if let Some(wasm_versions) = extract_wasm_version_info(contents) {
87                version_strings.extend(wasm_versions);
88            }
89        }
90        _ => {}
91    }
92
93    if file_version.is_none() && !version_strings.is_empty() {
94        file_version = version_strings
95            .iter()
96            .max_by_key(|v| v.matches('.').count())
97            .cloned();
98    }
99
100    let confidence = calculate_version_confidence(&version_strings, &file_version);
101
102    VersionInfo {
103        version_strings: version_strings.into_iter().collect(),
104        file_version,
105        product_version,
106        company,
107        product_name,
108        confidence,
109    }
110}
111
112pub fn extract_license_info(strings: &[String]) -> LicenseInfo {
113    let mut licenses = HashSet::new();
114    let mut copyright_notices = Vec::new();
115    let mut spdx_identifiers = HashSet::new();
116    let mut license_texts = Vec::new();
117
118    let license_patterns = [
119        (
120            Regex::new(r"(?i)\b(MIT|BSD|GPL|LGPL|Apache|Mozilla|ISC|Unlicense)\b").unwrap(),
121            "identifier",
122        ),
123        (
124            Regex::new(r"(?i)licensed under the ([^.,;]+)").unwrap(),
125            "phrase",
126        ),
127        (
128            Regex::new(r"(?i)license:\s*([^.,;\n]+)").unwrap(),
129            "declaration",
130        ),
131        (Regex::new(r"(?i)copyright\s+.*").unwrap(), "copyright"),
132        (
133            Regex::new(r"SPDX-License-Identifier:\s*([^\s]+)").unwrap(),
134            "spdx",
135        ),
136    ];
137
138    let license_text_patterns = [
139        Regex::new(r"(?i)permission is hereby granted.*").unwrap(),
140        Regex::new(r"(?i)redistribution and use in source and binary forms.*").unwrap(),
141        Regex::new(r"(?i)this program is free software.*").unwrap(),
142        Regex::new(r"(?i)licensed under the apache license.*").unwrap(),
143    ];
144
145    for string in strings {
146        if string.len() < 10 {
147            continue;
148        }
149
150        for (pattern, kind) in &license_patterns {
151            for captures in pattern.captures_iter(string) {
152                match *kind {
153                    "identifier" | "phrase" | "declaration" => {
154                        if let Some(license) = captures.get(1) {
155                            let license_str = normalize_license_name(license.as_str());
156                            if !license_str.is_empty() {
157                                licenses.insert(license_str);
158                            }
159                        }
160                    }
161                    "copyright" => {
162                        copyright_notices.push(string.clone());
163                    }
164                    "spdx" => {
165                        if let Some(spdx) = captures.get(1) {
166                            spdx_identifiers.insert(spdx.as_str().to_string());
167                        }
168                    }
169                    _ => {}
170                }
171            }
172        }
173
174        for pattern in &license_text_patterns {
175            if pattern.is_match(string) && string.len() > 100 {
176                license_texts.push(string.clone());
177                if let Some(inferred) = infer_license_from_text(string) {
178                    licenses.insert(inferred);
179                }
180            }
181        }
182    }
183
184    let confidence = calculate_license_confidence(&licenses, &spdx_identifiers, &license_texts);
185
186    LicenseInfo {
187        licenses: licenses.into_iter().collect(),
188        copyright_notices,
189        spdx_identifiers: spdx_identifiers.into_iter().collect(),
190        license_texts,
191        confidence,
192    }
193}
194
/// Decides whether `version` looks like a dotted version number.
///
/// A candidate is accepted when all of the following hold:
/// * it is between 3 and 20 bytes long and contains at least one `.`;
/// * it has at most five dot-separated components;
/// * every component starts with a run of ASCII digits whose value is at most
///   9999. Anything after that run (for example `-beta` or `rc1`) is allowed.
///
/// Empty components (`"1..2"`), components that do not start with a digit
/// (`"example.com"`) and numbers too large for `u32` (`"99999999999.1"`) are
/// therefore rejected.
pub fn is_valid_version(version: &str) -> bool {
    if !(3..=20).contains(&version.len()) || !version.contains('.') {
        return false;
    }

    let mut components = 0usize;
    for part in version.split('.') {
        components += 1;
        if components > 5 {
            return false;
        }

        // Leading digit run of the component; empty when it starts with a non-digit.
        let digits_end = part
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(part.len());
        let digits = &part[..digits_end];

        // A failed parse means "no digits" or overflow; both disqualify the candidate.
        if !matches!(digits.parse::<u32>(), Ok(number) if number <= 9999) {
            return false;
        }
    }

    true
}
215
216pub fn extract_company_name(string: &str) -> Option<String> {
217    let patterns = [
218        Regex::new(r"(?i)company:\s*([^.,;\n]+)").unwrap(),
219        Regex::new(r"(?i)corporation:\s*([^.,;\n]+)").unwrap(),
220        Regex::new(r"(?i)© \d{4}\s+([^.,;\n]+)").unwrap(),
221        Regex::new(
222            r"(?i)copyright.*?(\w+(?:\s+\w+){0,3})(?:\s+inc\.?|\s+corp\.?|\s+ltd\.?|\s+llc)",
223        )
224        .unwrap(),
225    ];
226
227    for pattern in &patterns {
228        if let Some(caps) = pattern.captures(string) {
229            if let Some(m) = caps.get(1) {
230                let s = m.as_str().trim();
231                if s.len() > 2 && s.len() < 100 {
232                    return Some(s.to_string());
233                }
234            }
235        }
236    }
237    None
238}
239
240pub fn extract_product_name(string: &str) -> Option<String> {
241    let patterns = [
242        Regex::new(r"(?i)product:\s*([^.,;\n]+)").unwrap(),
243        Regex::new(r"(?i)application:\s*([^.,;\n]+)").unwrap(),
244        Regex::new(r"(?i)program:\s*([^.,;\n]+)").unwrap(),
245    ];
246
247    for pattern in &patterns {
248        if let Some(caps) = pattern.captures(string) {
249            if let Some(m) = caps.get(1) {
250                let s = m.as_str().trim();
251                if s.len() > 2 && s.len() < 100 {
252                    return Some(s.to_string());
253                }
254            }
255        }
256    }
257    None
258}
259
/// Maps a raw license name to a canonical identifier.
///
/// The input is trimmed and compared case-insensitively against a small table
/// of known names (`mit`, `bsd`, `gpl`, `lgpl`, `apache`, `mozilla`, `isc`,
/// `unlicense`). If there is no direct hit, a trailing word `license` /
/// `licence` is dropped and the lookup is retried, so `"MIT License"` maps to
/// `"MIT"`.
///
/// Unknown names are returned trimmed and lower-cased; whitespace-only input
/// yields an empty string.
pub fn normalize_license_name(license: &str) -> String {
    /// Canonical identifier for an already lower-cased, trimmed key.
    fn canonical(key: &str) -> Option<&'static str> {
        let name = match key {
            "mit" => "MIT",
            "bsd" => "BSD",
            "gpl" => "GPL",
            "lgpl" => "LGPL",
            "apache" => "Apache-2.0",
            "mozilla" => "MPL-2.0",
            "isc" => "ISC",
            "unlicense" => "Unlicense",
            _ => return None,
        };
        Some(name)
    }

    // Captured names often carry surrounding whitespace, which would otherwise
    // defeat the table lookup.
    let lowered = license.trim().to_lowercase();
    if let Some(name) = canonical(&lowered) {
        return name.to_string();
    }

    // Retry without a trailing "license"/"licence" word; the whitespace
    // requirement keeps names such as "unlicense" intact.
    let without_suffix = lowered
        .strip_suffix("license")
        .or_else(|| lowered.strip_suffix("licence"))
        .filter(|rest| rest.ends_with(char::is_whitespace))
        .map(str::trim_end);

    match without_suffix.and_then(canonical) {
        Some(name) => name.to_string(),
        None => lowered,
    }
}
273
/// Guesses a license name from the wording of a license text.
///
/// The text is lower-cased and checked against the rules below; the first
/// rule whose phrases are all present wins:
/// 1. `permission is hereby granted` together with `mit` -> `MIT`
/// 2. `redistribution and use in source and binary forms` -> `BSD`
/// 3. `apache license` -> `Apache-2.0`
/// 4. `gnu general public license` -> `GPL`
///
/// Returns `None` when no rule applies.
pub fn infer_license_from_text(text: &str) -> Option<String> {
    // Note: `mit` is a plain substring test, so words such as "permit" or
    // "limitation" satisfy it as well.
    const RULES: [(&[&str], &str); 4] = [
        (&["permission is hereby granted", "mit"], "MIT"),
        (&["redistribution and use in source and binary forms"], "BSD"),
        (&["apache license"], "Apache-2.0"),
        (&["gnu general public license"], "GPL"),
    ];

    let lowered = text.to_lowercase();
    RULES
        .iter()
        .find(|(phrases, _)| phrases.iter().all(|phrase| lowered.contains(phrase)))
        .map(|(_, name)| name.to_string())
}
288
/// Scores how trustworthy the collected version data is.
///
/// The score is the sum of three contributions, capped at `1.0`:
/// * `0.3` when at least one version string was found;
/// * `0.4` when a file version is present;
/// * `0.3` when exactly one version string was found, `0.1` when there are
///   several.
pub fn calculate_version_confidence(
    version_strings: &HashSet<String>,
    file_version: &Option<String>,
) -> f64 {
    let found = version_strings.len();

    let presence: f64 = if found > 0 { 0.3 } else { 0.0 };
    let file_bonus: f64 = if file_version.is_some() { 0.4 } else { 0.0 };
    // A single candidate is unambiguous; several candidates are worth less.
    let agreement: f64 = match found {
        0 => 0.0,
        1 => 0.3,
        _ => 0.1,
    };

    (presence + file_bonus + agreement).min(1.0)
}
307
/// Scores how trustworthy the collected license data is.
///
/// The score is the sum of three contributions, capped at `1.0`:
/// * `0.5` when at least one SPDX identifier was found;
/// * `0.3` when at least one license name was found;
/// * `0.2` when at least one license text was found.
pub fn calculate_license_confidence(
    licenses: &HashSet<String>,
    spdx: &HashSet<String>,
    texts: &[String],
) -> f64 {
    // Weight contributed by a category when it has at least one entry.
    let weight = |is_empty: bool, value: f64| -> f64 {
        if is_empty {
            0.0
        } else {
            value
        }
    };

    let spdx_part = weight(spdx.is_empty(), 0.5);
    let name_part = weight(licenses.is_empty(), 0.3);
    let text_part = weight(texts.is_empty(), 0.2);

    (spdx_part + name_part + text_part).min(1.0)
}
325
326// -----------------------------------------
327// Format-specific extractors below
328// -----------------------------------------
329
/// Version details read from a PE image by `extract_pe_version_info`.
///
/// The fields are public so that callers of the public extractor can read the
/// values it returns.
#[derive(Debug, Clone)]
pub struct PeVersionInfo {
    /// File version, when available.
    pub file_version: Option<String>,
    /// Product version, when available.
    pub product_version: Option<String>,
    /// Company name, when available.
    pub company: Option<String>,
    /// Product name, when available.
    pub product_name: Option<String>,
}
337
338pub fn extract_pe_version_info(contents: &[u8]) -> Option<PeVersionInfo> {
339    // Use goblin to parse PE headers and extract basic version info from optional header
340    if let Ok(pe) = PE::parse(contents) {
341        if let Some(ref opt_header) = pe.header.optional_header {
342            let windows = &opt_header.windows_fields;
343
344            // File version: image version fields (if non-zero)
345            let file_version =
346                if windows.major_image_version != 0 || windows.minor_image_version != 0 {
347                    Some(format!(
348                        "{}.{}",
349                        windows.major_image_version, windows.minor_image_version
350                    ))
351                } else {
352                    None
353                };
354
355            // Product version: subsystem version fields (if non-zero)
356            let product_version =
357                if windows.major_subsystem_version != 0 || windows.minor_subsystem_version != 0 {
358                    Some(format!(
359                        "{}.{}",
360                        windows.major_subsystem_version, windows.minor_subsystem_version
361                    ))
362                } else {
363                    None
364                };
365
366            return Some(PeVersionInfo {
367                file_version,
368                product_version,
369                company: None,      // Not available from headers
370                product_name: None, // Not available from headers
371            });
372        }
373    }
374    None
375}
376
377pub fn extract_elf_version_info(contents: &[u8]) -> Option<Vec<String>> {
378    if let Ok(elf) = Elf::parse(contents) {
379        let mut versions = Vec::new();
380        if let Some(note_iter) = elf.iter_note_headers(contents) {
381            for note_result in note_iter {
382                if let Ok(n) = note_result {
383                    if n.name == "GNU" && n.n_type == goblin::elf::note::NT_GNU_BUILD_ID {
384                        let hex = n
385                            .desc
386                            .iter()
387                            .map(|b| format!("{:02x}", b))
388                            .collect::<String>();
389                        versions.push(hex);
390                    }
391                }
392            }
393        }
394        Some(versions)
395    } else {
396        None
397    }
398}
399
/// Placeholder for Mach-O version extraction.
///
/// The input is ignored and the result is always `None`.
pub fn extract_macho_version_info(_contents: &[u8]) -> Option<Vec<String>> {
    // Advanced Mach-O version extraction is not implemented yet.
    None
}
403
404pub fn extract_wasm_version_info(contents: &[u8]) -> Option<Vec<String>> {
405    let mut versions = Vec::new();
406    let parser = Parser::new(0);
407    for payload in parser.parse_all(contents) {
408        if let Ok(Payload::CustomSection(s)) = payload {
409            if s.name().contains("version") || s.name().contains("meta") {
410                let text = String::from_utf8_lossy(s.data());
411                for line in text.lines() {
412                    if let Some(v) = line.split_whitespace().find(|w| is_valid_version(w)) {
413                        versions.push(v.to_string());
414                    }
415                }
416            }
417        }
418    }
419    Some(versions)
420}