nabla_cli/binary/
binary_analysis.rs

1use super::{BinaryAnalysis, extract_license_info, extract_version_info};
2use crate::enterprise::crypto::CryptoProvider;
3use chrono::Utc;
4use goblin::{
5    Object as GoblinObject,
6    elf::Elf,
7    mach::{MachO, load_command::CommandVariant},
8    pe::PE,
9};
10use infer;
11use object::{Object, ObjectSymbol};
12use sha2::{Digest, Sha256};
13use std::collections::HashSet;
14use uuid::Uuid;
15use wasmparser::{Parser, Payload};
16
17pub async fn analyze_binary(
18    file_name: &str,
19    contents: &[u8],
20    crypto_provider: &CryptoProvider,
21) -> anyhow::Result<BinaryAnalysis> {
22    tracing::info!(
23        "Starting binary analysis for '{}' ({} bytes)",
24        file_name,
25        contents.len()
26    );
27
28    // Early validation for very small files
29    if contents.len() < 50 {
30        tracing::warn!(
31            "File is very small ({} bytes), likely not a binary executable",
32            contents.len()
33        );
34        return analyze_small_file(file_name, contents, crypto_provider);
35    }
36
37    let sha256_hash = Sha256::digest(contents);
38    let alternative_hash = crypto_provider.hash_alternative(contents)?;
39
40    // Detect file type with more detailed logging
41    let detected_type = infer::get(contents);
42    let file_type = if let Some(kind) = detected_type {
43        tracing::info!(
44            "Detected file type: {} ({})",
45            kind.mime_type(),
46            kind.extension()
47        );
48        kind.mime_type().to_string()
49    } else {
50        tracing::debug!("Could not detect file type, using fallback");
51        detect_file_type_fallback(file_name, contents)
52    };
53
54    let mut analysis = BinaryAnalysis {
55        id: Uuid::new_v4(),
56        file_name: file_name.to_string(),
57        format: file_type.clone(),
58        architecture: "unknown".to_string(),
59        languages: Vec::new(),
60        detected_symbols: Vec::new(),
61        embedded_strings: extract_strings(contents),
62        suspected_secrets: Vec::new(),
63        imports: Vec::new(),
64        exports: Vec::new(),
65        hash_sha256: format!("{:x}", sha256_hash),
66        hash_blake3: Some(hex::encode(&alternative_hash)),
67        size_bytes: contents.len() as u64,
68        linked_libraries: Vec::new(),
69        static_linked: false,
70        version_info: None,
71        license_info: None,
72        metadata: serde_json::json!({
73            "fips_mode": crypto_provider.fips_enabled,
74            "hash_algorithm": if crypto_provider.fips_enabled { "SHA-512" } else { "Blake3" }
75        }),
76        created_at: Utc::now(),
77        sbom: None,
78    };
79
80    // Try different parsing strategies based on file type and magic bytes
81    let mut parsed_successfully = false;
82
83    if contents.len() >= 4 {
84        match &contents[0..4] {
85            [0x7f, b'E', b'L', b'F'] => {
86                tracing::info!("ELF magic detected, using goblin ELF parser");
87                if let Ok(GoblinObject::Elf(elf)) = GoblinObject::parse(contents) {
88                    analyze_elf(&mut analysis, &elf, contents)?;
89                    parsed_successfully = true;
90                }
91            }
92            [b'M', b'Z', _, _] => {
93                tracing::info!("PE magic detected, using goblin PE parser");
94                if let Ok(GoblinObject::PE(pe)) = GoblinObject::parse(contents) {
95                    analyze_pe(&mut analysis, &pe, contents)?;
96                    parsed_successfully = true;
97                }
98            }
99            [0xfe, 0xed, 0xfa, 0xce] | [0xce, 0xfa, 0xed, 0xfe] => {
100                tracing::info!("Mach-O magic detected, using goblin Mach-O parser");
101                if let Ok(GoblinObject::Mach(mach)) = GoblinObject::parse(contents) {
102                    match mach {
103                        goblin::mach::Mach::Fat(_) => {
104                            analysis.format = "macho-fat".to_string();
105                            analysis.architecture = "multi".to_string();
106                        }
107                        goblin::mach::Mach::Binary(macho) => {
108                            analyze_macho(&mut analysis, &macho, contents)?
109                        }
110                    }
111                    parsed_successfully = true;
112                }
113            }
114            [0x00, 0x61, 0x73, 0x6d] => {
115                tracing::info!("WASM magic detected, using wasmparser");
116                if analyze_wasm(&mut analysis, contents).is_ok() {
117                    parsed_successfully = true;
118                }
119            }
120            _ => {}
121        }
122    }
123
124    if !parsed_successfully {
125        tracing::debug!("No specific magic bytes found, attempting generic goblin parsing...");
126        match GoblinObject::parse(contents) {
127            Ok(obj) => {
128                tracing::info!("Successfully parsed with goblin (generic)");
129                match obj {
130                    GoblinObject::Elf(elf) => {
131                        tracing::info!("Detected ELF binary (generic)");
132                        analyze_elf(&mut analysis, &elf, contents)?;
133                        parsed_successfully = true;
134                    }
135                    GoblinObject::PE(pe) => {
136                        tracing::info!("Detected PE binary (generic)");
137                        analyze_pe(&mut analysis, &pe, contents)?;
138                        parsed_successfully = true;
139                    }
140                    GoblinObject::Mach(mach) => {
141                        tracing::info!("Detected Mach-O binary (generic)");
142                        match mach {
143                            goblin::mach::Mach::Fat(_) => {
144                                analysis.format = "macho-fat".to_string();
145                                analysis.architecture = "multi".to_string();
146                            }
147                            goblin::mach::Mach::Binary(macho) => {
148                                analyze_macho(&mut analysis, &macho, contents)?
149                            }
150                        }
151                        parsed_successfully = true;
152                    }
153                    GoblinObject::Archive(_) => {
154                        tracing::info!("Detected archive");
155                        analysis.format = "archive".to_string();
156                        parsed_successfully = true;
157                    }
158                    _ => {
159                        tracing::debug!("Unknown goblin object type");
160                    }
161                }
162            }
163            Err(e) => {
164                tracing::debug!("Goblin parsing failed: {}, trying WebAssembly", e);
165                if analyze_wasm(&mut analysis, contents).is_ok() {
166                    tracing::info!("Successfully parsed as WebAssembly");
167                    parsed_successfully = true;
168                }
169            }
170        }
171    }
172
173    if !parsed_successfully {
174        tracing::info!("All specialized parsers failed, using generic analysis");
175        analyze_unknown_binary(&mut analysis, contents)?;
176    } else {
177        tracing::info!("Successfully analyzed {} as {}", file_name, analysis.format);
178    }
179
180    // Extract version and license information
181    tracing::debug!("Extracting version and license metadata");
182    analysis.version_info = Some(extract_version_info(
183        contents,
184        &analysis.embedded_strings,
185        &analysis.format,
186    ));
187    analysis.license_info = Some(extract_license_info(&analysis.embedded_strings));
188
189    tracing::info!(
190        "Metadata extraction complete: version_confidence={:.2}, license_confidence={:.2}",
191        analysis
192            .version_info
193            .as_ref()
194            .map(|v| v.confidence)
195            .unwrap_or(0.0),
196        analysis
197            .license_info
198            .as_ref()
199            .map(|l| l.confidence)
200            .unwrap_or(0.0)
201    );
202
203    Ok(analysis)
204}
205
206fn analyze_macho(
207    analysis: &mut BinaryAnalysis,
208    macho: &MachO,
209    contents: &[u8],
210) -> anyhow::Result<()> {
211    analysis.format = "macho".to_string();
212
213    // Determine architecture
214    analysis.architecture = match macho.header.cputype() {
215        goblin::mach::constants::cputype::CPU_TYPE_X86_64 => "x86_64".to_string(),
216        goblin::mach::constants::cputype::CPU_TYPE_ARM64 => "aarch64".to_string(),
217        goblin::mach::constants::cputype::CPU_TYPE_X86 => "i386".to_string(),
218        _ => format!("unknown({})", macho.header.cputype()),
219    };
220
221    // Extract symbols (both regular and dynamic)
222    let mut symbol_set = HashSet::new();
223    if let Some(symbols) = &macho.symbols {
224        for symbol in symbols.iter() {
225            if let Ok((name, _)) = symbol {
226                if !name.is_empty() {
227                    symbol_set.insert(name.to_string());
228                    analysis.detected_symbols.push(name.to_string());
229                }
230            }
231        }
232    }
233
234    // Extract libraries and frameworks
235    for lib in &macho.libs {
236        let lib_name = lib.to_string();
237        analysis.linked_libraries.push(lib_name.clone());
238        // Add to embedded strings for version extraction
239        analysis.embedded_strings.push(lib_name.clone());
240        // Extract potential version info from library name (e.g., libcrypto.1.1.dylib)
241        if let Some(version) = extract_version_from_lib_name(&lib_name) {
242            analysis.embedded_strings.push(version);
243        }
244    }
245
246    // Use object crate for detailed import/export analysis
247    if let Ok(obj_file) = object::File::parse(contents) {
248        for symbol in obj_file.symbols() {
249            if let Ok(name) = symbol.name() {
250                if !name.is_empty() {
251                    if symbol.is_undefined() {
252                        analysis.imports.push(name.to_string());
253                        analysis.embedded_strings.push(name.to_string());
254                    } else if symbol.is_global() {
255                        analysis.exports.push(name.to_string());
256                    }
257                    symbol_set.insert(name.to_string());
258                }
259            }
260        }
261    }
262
263    // Extract additional metadata from load commands
264    let mut metadata = serde_json::json!({
265        "analysis_type": "macho",
266        "load_commands": [],
267        "frameworks": [],
268        "min_os_version": null,
269    });
270
271    // Process load commands for frameworks and version info
272    for lc in macho.load_commands.iter() {
273        match lc.command {
274            CommandVariant::LoadDylib(ref dylib) => {
275                let offset = dylib.dylib.name as usize;
276                if offset < contents.len() {
277                    let name_bytes = &contents[offset..];
278                    if let Some(end) = name_bytes.iter().position(|&b| b == 0) {
279                        if let Ok(name_str) = std::str::from_utf8(&name_bytes[..end]) {
280                            if name_str.contains(".framework") {
281                                metadata["frameworks"]
282                                    .as_array_mut()
283                                    .unwrap()
284                                    .push(serde_json::Value::String(name_str.to_string()));
285                                analysis.embedded_strings.push(name_str.to_string());
286                            }
287                        }
288                    }
289                }
290            }
291            CommandVariant::VersionMinMacosx(ref ver) => {
292                let (major, minor) = unpack_version(ver.version);
293                metadata["min_os_version"] =
294                    serde_json::Value::String(format!("{}.{}", major, minor));
295            }
296            CommandVariant::BuildVersion(ref build) => {
297                let (major, minor) = unpack_version(build.minos);
298                metadata["min_os_version"] =
299                    serde_json::Value::String(format!("{}.{}", major, minor));
300            }
301            _ => {}
302        }
303        metadata["load_commands"]
304            .as_array_mut()
305            .unwrap()
306            .push(serde_json::Value::String(format!("{:?}", lc.command)));
307    }
308
309    // Detect static linking
310    analysis.static_linked = macho.libs.is_empty() && symbol_set.iter().any(|s| s.contains("main"));
311
312    // Extract potential CPE identifiers for CVE matching
313    let cpe_candidates = extract_cpe_candidates(
314        &analysis.linked_libraries,
315        &analysis.imports,
316        &analysis.detected_symbols,
317    );
318    analysis.metadata = serde_json::json!({
319        "macho_metadata": metadata,
320        "cpe_candidates": cpe_candidates,
321    });
322
323    tracing::info!(
324        "Mach-O analysis complete: {} symbols, {} libraries, {} imports, {} exports",
325        analysis.detected_symbols.len(),
326        analysis.linked_libraries.len(),
327        analysis.imports.len(),
328        analysis.exports.len()
329    );
330
331    Ok(())
332}
333
/// Extracts a dotted version number from a library file name.
///
/// The name is split on `.` and the first run of consecutive, non-empty, all-digit
/// components is joined back together, so `libcrypto.1.1.dylib` yields `1.1` and
/// `libssl.so.1.0.0` yields `1.0.0`. Returns `None` when no component is purely numeric.
fn extract_version_from_lib_name(lib_name: &str) -> Option<String> {
    // An empty component (from "..", or a leading/trailing dot) is not a version part.
    fn is_numeric(part: &&str) -> bool {
        !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit())
    }

    let version_parts: Vec<&str> = lib_name
        .split('.')
        .skip_while(|part| !is_numeric(part))
        .take_while(is_numeric)
        .collect();

    if version_parts.is_empty() {
        None
    } else {
        Some(version_parts.join("."))
    }
}
344
/// Splits a packed Mach-O version word into its `(major, minor)` components.
///
/// The upper 16 bits hold the major version and the following 8 bits the minor version;
/// the lowest 8 bits are not returned.
fn unpack_version(version: u32) -> (u32, u32) {
    let [major_hi, major_lo, minor, _lowest] = version.to_be_bytes();
    let major = u16::from_be_bytes([major_hi, major_lo]);
    (u32::from(major), u32::from(minor))
}
351
352// Helper function to generate CPE-like identifiers
353fn extract_cpe_candidates(libs: &[String], imports: &[String], symbols: &[String]) -> Vec<String> {
354    let mut cpes = HashSet::new();
355    for item in libs.iter().chain(imports.iter()).chain(symbols.iter()) {
356        let item_lower = item.to_lowercase();
357        // Example: Convert "libcrypto.1.1.dylib" to "cpe:2.3:a:openssl:openssl:1.1:*:*:*:*:*:*:*"
358        if item_lower.contains("openssl")
359            || item_lower.contains("libcrypto")
360            || item_lower.contains("libssl")
361        {
362            if let Some(version) = extract_version_from_lib_name(&item_lower) {
363                cpes.insert(format!(
364                    "cpe:2.3:a:openssl:openssl:{}:*:*:*:*:*:*:*",
365                    version
366                ));
367            } else {
368                cpes.insert("cpe:2.3:a:openssl:openssl:*:*:*:*:*:*:*:*".to_string());
369            }
370        }
371        // Add more CPE patterns for common libraries (e.g., zlib, curl)
372        if item_lower.contains("zlib") {
373            if let Some(version) = extract_version_from_lib_name(&item_lower) {
374                cpes.insert(format!("cpe:2.3:a:zlib:zlib:{}:*:*:*:*:*:*:*", version));
375            }
376        }
377        if item_lower.contains("curl") || item_lower.contains("libcurl") {
378            if let Some(version) = extract_version_from_lib_name(&item_lower) {
379                cpes.insert(format!("cpe:2.3:a:curl:curl:{}:*:*:*:*:*:*:*", version));
380            }
381        }
382    }
383    cpes.into_iter().collect()
384}
385
386fn analyze_elf(analysis: &mut BinaryAnalysis, elf: &Elf, contents: &[u8]) -> anyhow::Result<()> {
387    analysis.format = "elf".to_string();
388
389    // Determine architecture
390    analysis.architecture = match elf.header.e_machine {
391        goblin::elf::header::EM_X86_64 => "x86_64".to_string(),
392        goblin::elf::header::EM_386 => "i386".to_string(),
393        goblin::elf::header::EM_ARM => "arm".to_string(),
394        goblin::elf::header::EM_AARCH64 => "aarch64".to_string(),
395        goblin::elf::header::EM_RISCV => "riscv".to_string(),
396        _ => format!("unknown({})", elf.header.e_machine),
397    };
398
399    // Extract symbols
400    for sym in &elf.syms {
401        if let Some(name) = elf.strtab.get_at(sym.st_name) {
402            if !name.is_empty() {
403                analysis.detected_symbols.push(name.to_string());
404            }
405        }
406    }
407
408    // Extract dynamic symbols
409    for sym in &elf.dynsyms {
410        if let Some(name) = elf.dynstrtab.get_at(sym.st_name) {
411            if !name.is_empty() {
412                analysis.detected_symbols.push(name.to_string());
413            }
414        }
415    }
416
417    // Extract libraries
418    for lib in &elf.libraries {
419        analysis.linked_libraries.push(lib.to_string());
420        // Store library name for regex-based version extraction later
421        analysis.embedded_strings.push(lib.to_string());
422    }
423
424    // Determine if statically linked
425    analysis.static_linked =
426        elf.libraries.is_empty() && elf.header.e_type == goblin::elf::header::ET_EXEC;
427
428    // Extract imports/exports using object crate for more detailed analysis
429    if let Ok(obj_file) = object::File::parse(contents) {
430        for symbol in obj_file.symbols() {
431            if let Ok(name) = symbol.name() {
432                if symbol.is_undefined() {
433                    analysis.imports.push(name.to_string());
434                } else if symbol.is_global() {
435                    analysis.exports.push(name.to_string());
436                }
437            }
438        }
439    }
440
441    Ok(())
442}
443
444fn analyze_pe(analysis: &mut BinaryAnalysis, pe: &PE, _contents: &[u8]) -> anyhow::Result<()> {
445    analysis.format = "pe".to_string();
446
447    // Determine architecture
448    analysis.architecture = match pe.header.coff_header.machine {
449        goblin::pe::header::COFF_MACHINE_X86_64 => "x86_64".to_string(),
450        goblin::pe::header::COFF_MACHINE_X86 => "i386".to_string(),
451        goblin::pe::header::COFF_MACHINE_ARM64 => "aarch64".to_string(),
452        _ => format!("unknown({})", pe.header.coff_header.machine),
453    };
454
455    // Extract exports
456    for export in &pe.exports {
457        if let Some(name) = &export.name {
458            analysis.exports.push(name.to_string());
459        }
460    }
461
462    // Extract imports
463    for import in &pe.imports {
464        analysis.imports.push(import.name.to_string());
465        // Add import name to embedded strings for version extraction heuristics
466        analysis.embedded_strings.push(import.name.to_string());
467        if !analysis.linked_libraries.contains(&import.dll.to_string()) {
468            analysis.linked_libraries.push(import.dll.to_string());
469            // Include DLL name in embedded strings so version like "vcruntime140.dll" can be parsed
470            analysis.embedded_strings.push(import.dll.to_string());
471        }
472    }
473
474    // PE files are typically dynamically linked if they have imports
475    analysis.static_linked = pe.imports.is_empty();
476
477    Ok(())
478}
479
480fn analyze_wasm(analysis: &mut BinaryAnalysis, contents: &[u8]) -> anyhow::Result<()> {
481    tracing::info!("Starting WASM analysis");
482    analysis.format = "application/wasm".to_string();
483    analysis.architecture = "wasm32".to_string();
484    analysis.languages.push("WebAssembly".to_string());
485
486    let parser = Parser::new(0);
487    let mut imports = HashSet::new();
488    let mut exports = HashSet::new();
489    let mut function_count = 0;
490    let mut memory_info = Vec::new();
491    let mut table_info = Vec::new();
492
493    for payload in parser.parse_all(contents) {
494        use wasmparser::Payload as WasmPayload;
495        match payload {
496            Ok(payload) => {
497                match payload {
498                    Payload::Version { num, .. } => {
499                        tracing::debug!("WASM version: {}", num);
500                    }
501                    Payload::ImportSection(reader) => {
502                        for import in reader {
503                            match import {
504                                Ok(import) => {
505                                    let import_name = format!("{}::{}", import.module, import.name);
506                                    imports.insert(import_name);
507                                    tracing::debug!(
508                                        "Found import: {}::{}",
509                                        import.module,
510                                        import.name
511                                    );
512                                }
513                                Err(e) => tracing::warn!("Failed to parse import: {}", e),
514                            }
515                        }
516                    }
517                    Payload::ExportSection(reader) => {
518                        for export in reader {
519                            match export {
520                                Ok(export) => {
521                                    exports.insert(export.name.to_string());
522                                    tracing::debug!("Found export: {}", export.name);
523                                }
524                                Err(e) => tracing::warn!("Failed to parse export: {}", e),
525                            }
526                        }
527                    }
528                    Payload::FunctionSection(reader) => {
529                        function_count = reader.count();
530                        tracing::debug!("Function count: {}", function_count);
531                    }
532                    Payload::MemorySection(reader) => {
533                        for memory in reader {
534                            match memory {
535                                Ok(memory) => {
536                                    memory_info.push(format!(
537                                        "initial: {}, maximum: {:?}",
538                                        memory.initial, memory.maximum
539                                    ));
540                                }
541                                Err(e) => tracing::warn!("Failed to parse memory: {}", e),
542                            }
543                        }
544                    }
545                    Payload::TableSection(reader) => {
546                        for table in reader {
547                            match table {
548                                Ok(table) => {
549                                    table_info.push(format!(
550                                        "element_type: {:?}, initial: {}, maximum: {:?}",
551                                        table.ty.element_type, table.ty.initial, table.ty.maximum
552                                    ));
553                                }
554                                Err(e) => tracing::warn!("Failed to parse table: {}", e),
555                            }
556                        }
557                    }
558                    WasmPayload::CustomSection(custom) => {
559                        if let Ok(bytes_str) = std::str::from_utf8(custom.data()) {
560                            for s in extract_strings(bytes_str.as_bytes()) {
561                                analysis.embedded_strings.push(s);
562                            }
563                        }
564                    }
565                    Payload::TypeSection(reader) => {
566                        tracing::debug!("Type section with {} types", reader.count());
567                    }
568                    _ => {
569                        // tracing::debug!("Skipping WASM section: {:?}", payload);
570                    }
571                }
572            }
573            Err(e) => {
574                tracing::warn!("WASM parsing error: {}", e);
575                break;
576            }
577        }
578    }
579
580    analysis.imports = imports.into_iter().collect();
581    analysis.exports = exports.into_iter().collect();
582    analysis.static_linked = true; // WASM modules are self-contained
583
584    // Add WASM-specific metadata
585    analysis.metadata = serde_json::json!({
586        "wasm_version": "1.0",
587        "function_count": function_count,
588        "memory_sections": memory_info,
589        "table_sections": table_info,
590        "import_count": analysis.imports.len(),
591        "export_count": analysis.exports.len(),
592        "analysis_type": "wasm"
593    });
594
595    tracing::info!(
596        "WASM analysis complete: {} imports, {} exports, {} functions",
597        analysis.imports.len(),
598        analysis.exports.len(),
599        function_count
600    );
601
602    Ok(())
603}
604
605fn analyze_unknown_binary(analysis: &mut BinaryAnalysis, contents: &[u8]) -> anyhow::Result<()> {
606    tracing::debug!("Performing generic binary analysis");
607
608    // Try to determine if it's a text file
609    let text_ratio = contents
610        .iter()
611        .filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace())
612        .count() as f64
613        / contents.len() as f64;
614
615    if text_ratio > 0.7 {
616        analysis.format = "text".to_string();
617        tracing::debug!(
618            "Detected text file ({}% ASCII)",
619            (text_ratio * 100.0) as u32
620        );
621
622        // Try to extract more information from text files
623        let text = String::from_utf8_lossy(contents);
624        {
625            // Look for shebang
626            if text.starts_with("#!") {
627                analysis.format = "script".to_string();
628                analysis.languages.push("script".to_string());
629            }
630
631            // Look for common programming patterns
632            if text.contains("function") || text.contains("def ") {
633                analysis.languages.push("script".to_string());
634            }
635            if text.contains("#include") || text.contains("int main") {
636                analysis.languages.push("C/C++".to_string());
637            }
638            if text.contains("pub fn") || text.contains("fn main") {
639                analysis.languages.push("Rust".to_string());
640            }
641        }
642    } else {
643        analysis.format = "binary".to_string();
644        tracing::debug!(
645            "Detected binary file ({}% ASCII)",
646            (text_ratio * 100.0) as u32
647        );
648    }
649
650    analysis.architecture = "unknown".to_string();
651
652    // Add some basic metadata
653    analysis.metadata = serde_json::json!({
654        "ascii_ratio": text_ratio,
655        "analysis_type": "generic"
656    });
657
658    Ok(())
659}
660
661fn analyze_small_file(
662    file_name: &str,
663    contents: &[u8],
664    crypto_provider: &CryptoProvider,
665) -> anyhow::Result<BinaryAnalysis> {
666    tracing::info!(
667        "Analyzing small file '{}' ({} bytes)",
668        file_name,
669        contents.len()
670    );
671
672    let sha256_hash = Sha256::digest(contents);
673    let alternative_hash = crypto_provider.hash_alternative(contents)?;
674
675    // For small files, just extract strings and basic info
676    let strings = extract_strings(contents);
677    let text_content = String::from_utf8_lossy(contents);
678
679    // Check if it's mostly text
680    let text_ratio = contents
681        .iter()
682        .filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace())
683        .count() as f64
684        / contents.len() as f64;
685
686    let format = if text_ratio > 0.8 {
687        "text/plain"
688    } else {
689        "application/octet-stream"
690    }
691    .to_string();
692
693    // Try to determine what kind of small file this is
694    let mut languages = Vec::new();
695    let mut analysis_notes = Vec::new();
696
697    if strings.iter().any(|s| s.ends_with(".wasm")) {
698        analysis_notes.push("Contains WASM module reference".to_string());
699        languages.push("WebAssembly".to_string());
700    }
701
702    if strings
703        .iter()
704        .any(|s| s.ends_with(".dll") || s.ends_with(".exe"))
705    {
706        analysis_notes.push("Contains Windows executable reference".to_string());
707    }
708
709    if text_content.starts_with("#!") {
710        languages.push("Script".to_string());
711        analysis_notes.push("Shell script or executable script".to_string());
712    }
713
714    let metadata = serde_json::json!({
715        "ascii_ratio": text_ratio,
716        "analysis_type": "small_file",
717        "notes": analysis_notes,
718        "content_preview": text_content.chars().take(50).collect::<String>()
719    });
720
721    let version_info = extract_version_info(contents, &strings, &format);
722    let license_info = extract_license_info(&strings);
723
724    Ok(BinaryAnalysis {
725        id: Uuid::new_v4(),
726        file_name: file_name.to_string(),
727        format,
728        architecture: "n/a".to_string(),
729        languages,
730        detected_symbols: Vec::new(),
731        embedded_strings: strings,
732        suspected_secrets: Vec::new(),
733        imports: Vec::new(),
734        exports: Vec::new(),
735        hash_sha256: format!("{:x}", sha256_hash),
736        hash_blake3: Some(hex::encode(&alternative_hash)),
737        size_bytes: contents.len() as u64,
738        linked_libraries: Vec::new(),
739        static_linked: false,
740        version_info: Some(version_info),
741        license_info: Some(license_info),
742        metadata,
743        created_at: Utc::now(),
744        sbom: None,
745    })
746}
747
/// Guesses a MIME type when content sniffing found nothing.
///
/// The first four bytes are checked against the ELF, PE (`MZ`), Mach-O (32- and 64-bit,
/// both byte orders) and WebAssembly magics. If none matches, the file extension — the
/// text after the last `.` in `file_name`, compared case-insensitively — is consulted.
/// A name without a dot has no extension. Falls back to `application/octet-stream`.
fn detect_file_type_fallback(file_name: &str, contents: &[u8]) -> String {
    // Check for common magic bytes
    let by_magic = match contents.get(..4) {
        Some([0x7f, b'E', b'L', b'F']) => Some("application/x-elf"),
        Some([b'M', b'Z', _, _]) => Some("application/x-msdownload"), // PE
        // 32-bit Mach-O, big- and little-endian
        Some([0xfe, 0xed, 0xfa, 0xce]) | Some([0xce, 0xfa, 0xed, 0xfe]) => {
            Some("application/x-mach-binary")
        }
        // 64-bit Mach-O, big- and little-endian
        Some([0xfe, 0xed, 0xfa, 0xcf]) | Some([0xcf, 0xfa, 0xed, 0xfe]) => {
            Some("application/x-mach-binary")
        }
        Some([0x00, 0x61, 0x73, 0x6d]) => Some("application/wasm"), // WASM
        _ => None,
    };
    if let Some(mime) = by_magic {
        return mime.to_string();
    }

    // Check file extension; `rsplit_once` yields nothing for names without a dot, so a
    // file literally called "exe" or "a" is not mistaken for having that extension.
    let extension = file_name
        .rsplit_once('.')
        .map(|(_, ext)| ext.to_lowercase());
    let by_extension = match extension.as_deref() {
        Some("exe") | Some("dll") => "application/x-msdownload",
        Some("so") | Some("a") => "application/x-sharedlib",
        Some("wasm") => "application/wasm",
        _ => "application/octet-stream",
    };

    by_extension.to_string()
}
775
776fn extract_strings(contents: &[u8]) -> Vec<String> {
777    let mut strings = Vec::new();
778    let mut current_string = Vec::new();
779
780    tracing::debug!("Extracting strings from {} bytes", contents.len());
781
782    for &byte in contents {
783        if byte.is_ascii_graphic() || byte == b' ' || byte == b'\t' {
784            current_string.push(byte);
785        } else {
786            if current_string.len() >= 3 {
787                // Reduced minimum for small files
788                if let Ok(s) = String::from_utf8(current_string.clone()) {
789                    // Filter out very common/useless strings
790                    if !s.trim().is_empty() && !is_junk_string(&s) {
791                        strings.push(s.trim().to_string());
792                    }
793                }
794            }
795            current_string.clear();
796        }
797    }
798
799    // Process any remaining string
800    if current_string.len() >= 3 {
801        if let Ok(s) = String::from_utf8(current_string) {
802            if !s.trim().is_empty() && !is_junk_string(&s) {
803                strings.push(s.trim().to_string());
804            }
805        }
806    }
807
808    // Deduplicate and limit
809    strings.sort();
810    strings.dedup();
811    strings.truncate(50);
812
813    tracing::debug!("Extracted {} strings", strings.len());
814    strings
815}
816
/// Reports whether `s` looks like padding or noise rather than a meaningful string.
///
/// A string is junk when it is longer than 200 bytes, consists solely of NUL and space
/// characters, or consists solely of ASCII punctuation. The empty string is junk.
fn is_junk_string(s: &str) -> bool {
    // Very long strings are often noise.
    if s.len() > 200 {
        return true;
    }

    let has_non_padding = s.bytes().any(|b| b != 0 && b != b' ');
    if !has_non_padding {
        return true;
    }

    let has_non_punctuation = s.bytes().any(|b| !b.is_ascii_punctuation());
    !has_non_punctuation
}
823
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty input must be accepted and reported as a zero-byte file under the
    /// name it was submitted with.
    #[tokio::test]
    async fn test_analyze_empty() {
        let provider = CryptoProvider::new(false, false).unwrap();

        let outcome = analyze_binary("test.bin", &[], &provider).await;
        assert!(outcome.is_ok());

        let report = outcome.unwrap();
        assert_eq!(report.size_bytes, 0);
        assert_eq!(report.file_name, "test.bin");
    }
}
837}