droidsaw 2.0.0

DROIDSAW — unified Android reverse engineering CLI. Hermes, DEX, APK signing. JSON output, MCP server. Bytecode is not a security layer.
Documentation
// SPDX-License-Identifier: BSD-3-Clause

//! Corpus-runner extractor trait + per-APK context.
//!
//! The orchestrator opens each APK exactly once, builds an [`ApkContext`]
//! over the parsed state, and dispatches to every registered extractor.
//! Each [`CorpusExtractor`] returns a `Vec<KvPair>` for that APK; the
//! orchestrator writes the facts to the unified sqlite ledger.
//!
//! Two-phase trait shape:
//! - [`CorpusExtractor::prepare`] runs once per APK per extractor before
//!   any `extract` call. It returns the extractor's per-APK cache type
//!   (e.g. R8 recogniser's `TrampolineCensus`). Cache is dropped at the
//!   end of the APK's dispatch.
//! - [`CorpusExtractor::extract`] takes both the context and the cache
//!   and returns the facts.
//!
//! Cache type is per-extractor (associated type). Object-safe dispatch
//! is provided by the [`DynCorpusExtractor`] blanket impl, which hides
//! the cache type by running `prepare` and `extract` back-to-back inside
//! one method call.
//!
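//! Because each extractor's cache is built and dropped inside its own
//! `run` call, no extractor can observe another extractor's cached state.
//! Dispatch through these traits is a plain synchronous method call per
//! extractor per APK.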

use droidsaw_apk::Apk;
use droidsaw_common::corpus::{ExtractorError, KvPair};
use droidsaw_dex::DexFile;

use crate::context::HbcOwned;

/// Per-APK state passed to every registered extractor for one APK.
///
/// Constructed by the orchestrator from a parsed [`crate::context::CrossLayerContext`]
/// (see [`Self::from_layers`]) or built field-by-field in tests / mocks.
/// All fields are borrowed for the single lifetime `'a`; the orchestrator
/// owns the underlying state and re-uses it across the registered
/// extractors for one APK.
///
/// Adding a field for new bundle state that extractors might want (e.g. an
/// enriched signing metadata view, a pre-built per-method CFG cache shared
/// across extractors) is additive for extractors, which only read fields
/// through a shared reference. Code that builds the struct with a literal
/// (tests / mocks) has to supply the new field as well.
///
/// `Debug` is not derived because `HbcOwned` (a `self_cell`-built
/// owned/borrowed pair) does not implement `Debug`. Use the
/// individual fields directly for diagnostics, or hand-impl `Debug`
/// in a consumer that needs it.
pub struct ApkContext<'a> {
    /// Canonical SHA256 of the APK file, hex-encoded. This is the ledger's
    /// `facts.apk_sha256` column key; stability across runs is load-bearing.
    pub apk_sha256: &'a str,

    /// Path to the APK input on disk (informational; not the ledger key).
    /// May be a synthetic path for in-memory test APKs.
    pub apk_path: &'a str,

    /// Parsed APK container if any. `None` for raw-HBC / raw-DEX inputs.
    pub apk: Option<&'a Apk>,

    /// Parsed DEX files in their order from the APK (or one element for
    /// a raw-DEX input). Empty if input had no DEX content.
    pub dex_files: &'a [DexFile],

    /// Parsed HBC bundle, if any. `None` when the input carried no HBC
    /// content.
    pub hbc: Option<&'a HbcOwned>,
}

impl<'a> ApkContext<'a> {
    /// Build an `ApkContext` borrowing from a parsed `CrossLayerContext`.
    /// The orchestrator pre-computes `apk_sha256` (typically from the file
    /// bytes before parse) and passes both in.
    #[must_use]
    pub fn from_layers(
        apk_sha256: &'a str,
        layers: &'a crate::context::CrossLayerContext,
    ) -> Self {
        Self {
            apk_sha256,
            apk_path: layers.path.as_str(),
            apk: layers.apk.as_ref(),
            dex_files: layers.dex.as_slice(),
            hbc: layers.hbc.as_ref(),
        }
    }
}

/// The corpus-runner extractor trait. Implementations live in the top
/// binary (one per extractor); they typically delegate to a `pub fn` helper
/// in the relevant bundle crate (`droidsaw_dex::corpus::*`,
/// `droidsaw_apk::corpus::*`, etc.) that operates on already-parsed state.
///
/// Two-phase API:
/// - [`Self::prepare`] returns the extractor's per-APK cache. For
///   extractors with no precomputed state (most), `Cache = ()` is fine.
/// - [`Self::extract`] returns the facts using the cache. The cache lives
///   only for the duration of one APK's dispatch — no cross-APK state.
///
/// Both methods return `Result<_, ExtractorError>`. The orchestrator
/// records an `extractor_failed` fact on `Err` and continues with the
/// next extractor; one extractor's failure never aborts a run.
///
/// **`extractor_id` stability**: the returned string is the ledger key
/// for every fact this extractor emits. Renaming it splits cohorts in
/// the ledger and breaks downstream analysis joins. Behavior changes
/// ship under a new id; never rename in place.
pub trait CorpusExtractor: Sync + Send {
    /// Per-APK cache type. Use `()` if the extractor needs no precomputed
    /// state. Use a struct for state worth caching across the `extract`
    /// call(s) (e.g. R8 recogniser's `TrampolineCensus`, a per-DEX
    /// dominator graph, etc.).
    type Cache: Send + Sync;

    /// Stable identifier for this extractor. Used as the ledger's
    /// `facts.extractor_id` column for every fact emitted by `extract`.
    /// Must stay the same from commit to commit: once facts have been
    /// written under an id, that id is never renamed (see the stability
    /// note on this trait).
    fn extractor_id(&self) -> &'static str;

    /// Build the per-APK cache before any `extract` call. Runs exactly
    /// once per APK per extractor. The orchestrator drops the returned
    /// cache after `extract` returns.
    ///
    /// # Errors
    /// Returns [`ExtractorError`] if the APK lacks state this extractor
    /// requires (e.g. an R8 extractor on an APK with zero DEX files).
    fn prepare(&self, ctx: &ApkContext<'_>) -> Result<Self::Cache, ExtractorError>;

    /// Extract facts using the pre-built cache. Returns one [`KvPair`]
    /// per fact this extractor wants written to the ledger for this APK.
    /// The cache is passed by shared reference, so `extract` can read it
    /// but not consume it.
    ///
    /// Empty `Vec` is valid — means "no facts for this APK". The
    /// orchestrator writes nothing in that case (not even a marker row).
    ///
    /// # Errors
    /// Returns [`ExtractorError`] if extraction fails after `prepare`
    /// succeeded (e.g. resource budget exhausted mid-walk).
    fn extract(
        &self,
        ctx: &ApkContext<'_>,
        cache: &Self::Cache,
    ) -> Result<Vec<KvPair>, ExtractorError>;
}

/// Object-safe wrapper trait for heterogeneous extractor registry.
///
/// The orchestrator holds `Vec<Arc<dyn DynCorpusExtractor>>` (or
/// `Vec<Box<dyn DynCorpusExtractor>>`). The blanket impl runs the
/// two-phase `prepare` → `extract` sequence inside a single method call,
/// hiding the associated `Cache` type from the registry.
///
/// In the blanket impl the cache is a plain local variable inside `run`
/// and is dropped when the call ends. No persistent state. No `RefCell`,
/// and therefore no borrow-check panics from interior mutability;
/// failures surface only as the `Err` value returned from `run`.
pub trait DynCorpusExtractor: Sync + Send {
    /// Stable identifier — see [`CorpusExtractor::extractor_id`].
    fn extractor_id(&self) -> &'static str;

    /// Run the full prepare-then-extract sequence on one APK. Cache is
    /// constructed inside this call and dropped before returning.
    ///
    /// # Errors
    /// Forwards the first `Err` from either `prepare` or `extract`. If
    /// `prepare` errors, `extract` is not called.
    fn run(&self, ctx: &ApkContext<'_>) -> Result<Vec<KvPair>, ExtractorError>;
}

/// Blanket adapter: every [`CorpusExtractor`] is automatically usable as a
/// [`DynCorpusExtractor`] trait object.
impl<E> DynCorpusExtractor for E
where
    E: CorpusExtractor + Send + Sync,
{
    /// Forwards to the typed trait. The fully-qualified call is required
    /// because both traits declare a method named `extractor_id`.
    fn extractor_id(&self) -> &'static str {
        <E as CorpusExtractor>::extractor_id(self)
    }

    /// Builds the cache, then extracts with it. If `prepare` returns
    /// `Err`, that error is returned unchanged and `extract` is never
    /// invoked. The cache is owned by the closure and dropped as soon as
    /// `extract` returns.
    fn run(&self, ctx: &ApkContext<'_>) -> Result<Vec<KvPair>, ExtractorError> {
        CorpusExtractor::prepare(self, ctx)
            .and_then(|cache| CorpusExtractor::extract(self, ctx, &cache))
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use droidsaw_common::corpus::ExtractorValue;

    use super::*;

    /// Cache-less extractor (`Cache = ()`) that always reports the same
    /// two facts, regardless of the context it is given.
    struct MockSimple;

    impl CorpusExtractor for MockSimple {
        type Cache = ();

        fn extractor_id(&self) -> &'static str {
            "mock_simple"
        }

        fn prepare(&self, _ctx: &ApkContext<'_>) -> Result<Self::Cache, ExtractorError> {
            Ok(())
        }

        fn extract(
            &self,
            _ctx: &ApkContext<'_>,
            _cache: &Self::Cache,
        ) -> Result<Vec<KvPair>, ExtractorError> {
            let count = KvPair::new("mock_count", ExtractorValue::Int(7));
            let ok = KvPair::new("mock_ok", ExtractorValue::Bool(true));
            Ok(vec![count, ok])
        }
    }

    /// Extractor with a real `Cache` type, used to exercise the two-phase
    /// path: `prepare` derives a value from the context, `extract` reads
    /// it back out of the cache and emits it as a fact.
    struct MockCached;

    /// Cache produced by `MockCached::prepare`.
    struct CountCache {
        precomputed_count: i64,
    }

    impl CorpusExtractor for MockCached {
        type Cache = CountCache;

        fn extractor_id(&self) -> &'static str {
            "mock_cached"
        }

        fn prepare(&self, ctx: &ApkContext<'_>) -> Result<Self::Cache, ExtractorError> {
            // Stand-in for an expensive precomputation: the number of DEX
            // files, converted to the ledger's integer type.
            i64::try_from(ctx.dex_files.len())
                .map(|precomputed_count| CountCache { precomputed_count })
                .map_err(|e| ExtractorError::Internal(format!("dex count overflow: {e}")))
        }

        fn extract(
            &self,
            _ctx: &ApkContext<'_>,
            cache: &Self::Cache,
        ) -> Result<Vec<KvPair>, ExtractorError> {
            let fact = KvPair::new(
                "cached_dex_count",
                ExtractorValue::Int(cache.precomputed_count),
            );
            Ok(vec![fact])
        }
    }

    /// Extractor whose `prepare` always errors. Its `extract` panics, so
    /// any test that reaches it proves the short-circuit is broken.
    struct MockPrepareFails;

    impl CorpusExtractor for MockPrepareFails {
        type Cache = ();

        fn extractor_id(&self) -> &'static str {
            "mock_prepare_fails"
        }

        fn prepare(&self, _ctx: &ApkContext<'_>) -> Result<Self::Cache, ExtractorError> {
            Err(ExtractorError::MissingState("nothing here"))
        }

        fn extract(
            &self,
            _ctx: &ApkContext<'_>,
            _cache: &Self::Cache,
        ) -> Result<Vec<KvPair>, ExtractorError> {
            unreachable!("must not be called when prepare errored")
        }
    }

    /// Owned backing strings (sha, path) for a synthetic context. Returned
    /// separately so the caller keeps them alive while `ApkContext`
    /// borrows from them.
    fn mock_ctx() -> (String, String) {
        let sha = "0000000000000000000000000000000000000000000000000000000000000000".to_owned();
        let path = "/test/synthetic.apk".to_owned();
        (sha, path)
    }

    /// Context with no APK, no DEX files and no HBC bundle.
    fn build_ctx<'a>(sha: &'a str, path: &'a str) -> ApkContext<'a> {
        ApkContext {
            apk_sha256: sha,
            apk_path: path,
            apk: None,
            dex_files: &[],
            hbc: None,
        }
    }

    #[test]
    fn simple_extractor_dispatch_returns_expected_facts() {
        let (sha, path) = mock_ctx();
        let ctx = build_ctx(&sha, &path);

        let facts = DynCorpusExtractor::run(&MockSimple, &ctx).expect("run ok");

        let expected = [
            ("mock_count", ExtractorValue::Int(7)),
            ("mock_ok", ExtractorValue::Bool(true)),
        ];
        assert_eq!(facts.len(), expected.len());
        for (fact, (key, value)) in facts.iter().zip(expected) {
            assert_eq!(fact.key, key);
            assert_eq!(fact.value, value);
        }
    }

    #[test]
    fn cached_extractor_prepare_runs_before_extract() {
        let (sha, path) = mock_ctx();
        let ctx = build_ctx(&sha, &path);

        let facts = DynCorpusExtractor::run(&MockCached, &ctx).expect("run ok");

        assert_eq!(facts.len(), 1);
        for fact in &facts {
            assert_eq!(fact.key, "cached_dex_count");
            // The mock context has no DEX files, so the cached count is 0.
            assert_eq!(fact.value, ExtractorValue::Int(0));
        }
    }

    #[test]
    fn prepare_error_short_circuits_extract() {
        let (sha, path) = mock_ctx();
        let ctx = build_ctx(&sha, &path);

        // `extract` on this mock is `unreachable!()`; had it run, the test
        // would have panicked before producing a `Result` at all.
        let outcome = DynCorpusExtractor::run(&MockPrepareFails, &ctx);
        let err = outcome.unwrap_err();

        assert!(
            matches!(err, ExtractorError::MissingState("nothing here")),
            "expected MissingState, got {err:?}",
        );
    }

    #[test]
    fn extractor_id_stable_across_calls() {
        let ex = MockSimple;
        let ids = [
            DynCorpusExtractor::extractor_id(&ex),
            DynCorpusExtractor::extractor_id(&ex),
            DynCorpusExtractor::extractor_id(&ex),
        ];
        // Every call yielding the same expected literal implies the calls
        // agree with one another.
        for id in ids {
            assert_eq!(id, "mock_simple");
        }
    }

    #[test]
    fn heterogeneous_registry_dispatches_all_extractors() {
        // The registry shape the trait is designed for: extractors with
        // different `Cache` types stored behind one trait object type.
        let (sha, path) = mock_ctx();
        let ctx = build_ctx(&sha, &path);
        let registry: Vec<Arc<dyn DynCorpusExtractor>> = vec![
            Arc::new(MockSimple),
            Arc::new(MockCached),
        ];

        let all_facts: Vec<(String, KvPair)> = registry
            .iter()
            .flat_map(|ex| {
                let id = ex.extractor_id().to_owned();
                ex.run(&ctx)
                    .expect("run ok")
                    .into_iter()
                    .map(move |fact| (id.clone(), fact))
            })
            .collect();

        let expected = [
            ("mock_simple", "mock_count"),
            ("mock_simple", "mock_ok"),
            ("mock_cached", "cached_dex_count"),
        ];
        assert_eq!(all_facts.len(), expected.len());
        for ((id, fact), (want_id, want_key)) in all_facts.iter().zip(expected) {
            assert_eq!(*id, want_id);
            assert_eq!(fact.key, want_key);
        }
    }

    #[test]
    fn one_extractor_failure_does_not_abort_others_in_loop() {
        // Fault isolation as the dispatch loop is meant to provide it: a
        // failing extractor is tallied and the remaining ones still run.
        let (sha, path) = mock_ctx();
        let ctx = build_ctx(&sha, &path);
        let registry: Vec<Arc<dyn DynCorpusExtractor>> = vec![
            Arc::new(MockSimple),
            Arc::new(MockPrepareFails),
            Arc::new(MockCached),
        ];

        // One entry per extractor: `true` for `Ok`, `false` for `Err`.
        let outcomes: Vec<bool> = registry
            .iter()
            .map(|ex| match ex.run(&ctx) {
                Ok(facts) => {
                    assert!(!facts.is_empty(), "{} produced empty facts", ex.extractor_id());
                    true
                }
                Err(_) => false,
            })
            .collect();

        let successes = outcomes.iter().filter(|&&ok| ok).count();
        let failures = outcomes.iter().filter(|&&ok| !ok).count();
        assert_eq!(successes, 2);
        assert_eq!(failures, 1);
    }
}