calltrace-rs 1.1.4

//! # CallTrace - High-Performance Function Call Tracing Library
//!
//! CallTrace is a Rust library that provides comprehensive function call tracing capabilities
//! using GCC's `-finstrument-functions` feature. It captures function calls, arguments, return values,
//! and call relationships across multiple threads with minimal performance overhead.
//!
//! ## Features
//!
//! - **Minimal overhead when disabled**: Hooks return early after an atomic flag check
//! - **Thread-safe**: Full support for multi-threaded applications
//! - **Argument capture**: DWARF-based type-aware argument extraction
//! - **Return value tracing**: RAX/XMM0 register capture
//! - **Crash analysis**: Comprehensive crash reporting with stack traces
//! - **JSON output**: Structured, hierarchical call tree export
//! - **Symbol resolution**: Function name resolution with C++ demangling
//! - **Memory safe**: 100% Rust implementation with zero memory leaks
//!
//! ## Quick Start
//!
//! ### 1. Compile your C/C++ program with instrumentation
//!
//! ```bash
//! gcc -rdynamic -finstrument-functions -g your_program.c -o your_program
//! ```
//!
//! Required flags:
//! - `-rdynamic`: Export symbols for dladdr() resolution
//! - `-finstrument-functions`: Enable GCC instrumentation hooks  
//! - `-g`: Provide DWARF debug information for argument capture
//!
//! ### 2. Run with CallTrace
//!
//! ```bash
//! # Basic tracing
//! CALLTRACE_OUTPUT=trace.json LD_PRELOAD=./libcalltrace.so ./your_program
//!
//! # With argument capture (higher overhead)
//! CALLTRACE_CAPTURE_ARGS=1 CALLTRACE_OUTPUT=trace.json LD_PRELOAD=./libcalltrace.so ./your_program
//!
//! # With debug output
//! CALLTRACE_DEBUG=1 CALLTRACE_OUTPUT=trace.json LD_PRELOAD=./libcalltrace.so ./your_program
//! ```
//!
//! ### 3. Analyze the results
//!
//! The generated JSON file contains a hierarchical view of all function calls:
//!
//! ```json
//! {
//!   "metadata": {
//!     "version": "1.0.0",
//!     "timestamp": "2024-01-15T10:30:00Z",
//!     "total_calls": 1250,
//!     "threads": 4
//!   },
//!   "call_trees": {
//!     "12345": {
//!       "thread_id": 12345,
//!       "calls": [
//!         {
//!           "function": "main",
//!           "address": "0x401020",
//!           "start_time": "10:30:00.123456",
//!           "end_time": "10:30:00.987654",
//!           "arguments": [...],
//!           "children": [...]
//!         }
//!       ]
//!     }
//!   }
//! }
//! ```
//!
//! ## Environment Variables
//!
//! | Variable | Default | Description |
//! |----------|---------|-------------|
//! | `CALLTRACE_OUTPUT` | `{executable}.json` | Output file path |
//! | `CALLTRACE_CAPTURE_ARGS` | `false` | Enable argument capture |
//! | `CALLTRACE_MAX_DEPTH` | `100` | Maximum call depth |
//! | `CALLTRACE_DEBUG` | `false` | Enable debug output |
//! | `CALLTRACE_PRETTY_JSON` | `true` | Pretty-print JSON |
//!
//! ## Performance
//!
//! CallTrace is designed for production use with minimal overhead:
//!
//! - **Function calls**: ~50-100ns overhead per call (argument capture disabled)
//! - **Memory usage**: ~2MB for 10,000 function calls
//! - **Thread safety**: Zero data races, lock-free where possible
//! - **Atomic fast-path**: <5ns when tracing is disabled
//!
//! ## Architecture
//!
//! CallTrace uses a modular architecture:
//!
//! - [`cyg_profile`]: GCC instrumentation hooks entry points
//! - [`call_tree`]: Thread-safe call tree management
//! - [`dwarf_analyzer`]: DWARF debug information parsing
//! - [`register_reader`]: x86_64 register and argument capture  
//! - [`json_output`]: Structured JSON serialization
//! - [`crash_handler`]: Signal handling and crash analysis
//! - [`build_validator`]: Compilation requirement validation
//!
//! ## Safety and Reliability
//!
//! - **Memory safe**: No manual memory management, RAII throughout
//! - **Thread safe**: DashMap and `Arc<RwLock<T>>` for concurrent access
//! - **Signal safe**: Proper signal handler chaining and restoration
//! - **Crash resilient**: Comprehensive crash analysis and recovery
//! - **Production tested**: Extensive integration and stress testing
//!
//! ## Platform Support
//!
//! - **Architecture**: x86_64
//! - **Operating System**: Linux (with glibc)
//! - **Compiler**: GCC with `-finstrument-functions` support
//! - **Rust**: 2021 edition, stable channel
//!
//! ## Example Integration
//!
//! ```c
//! // your_program.c
//! #include <stdio.h>
//!
//! int fibonacci(int n) {
//!     if (n <= 1) return n;
//!     return fibonacci(n-1) + fibonacci(n-2);
//! }
//!
//! int main() {
//!     printf("fib(8) = %d\n", fibonacci(8));
//!     return 0;
//! }
//! ```
//!
//! Compile and trace:
//! ```bash
//! gcc -rdynamic -finstrument-functions -g your_program.c -o your_program
//! CALLTRACE_OUTPUT=fib_trace.json LD_PRELOAD=./libcalltrace.so ./your_program
//! ```
//!
//! This will generate a complete call tree showing the recursive fibonacci calls,
//! timing information, and call relationships.
//!
//! ## C API Functions
//!
//! The library exports C-compatible functions for manual control:
//!
//! - [`calltrace_init()`]: Initialize tracing (called automatically)
//! - [`calltrace_cleanup()`]: Cleanup and write output (called automatically)
//!
//! ## Internal Implementation Notes
//!
//! For developers contributing to CallTrace:
//!
//! - Uses `ctor`/`dtor` attributes for automatic initialization
//! - Thread-local storage for performance counters and buffers
//! - Bounded caching for DWARF function information (arbitrary entries are evicted when the cache is full)
//! - String interning for common type/function names
//! - Atomic operations for statistics and fast-path checks

#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]

use once_cell::sync::Lazy;
use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
use std::ffi::c_void;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex, Once};

// Core modules
pub mod build_validator;
pub mod call_tree;
pub mod crash_handler;
pub mod cyg_profile;
pub mod dwarf_analyzer;
pub mod error;
pub mod json_output;
pub mod register_reader;

// Re-exports for C compatibility
pub use cyg_profile::{__cyg_profile_func_enter, __cyg_profile_func_exit};

use call_tree::CallTreeManager;
use dwarf_analyzer::DwarfAnalyzer;
use json_output::JsonOutputGenerator;

/// Global call tree manager
///
/// Constructed lazily on first access. The function entry/exit handlers record calls
/// into it and `write_final_output` serializes its contents.
static CALL_TREE_MANAGER: Lazy<Arc<CallTreeManager>> =
    Lazy::new(|| Arc::new(CallTreeManager::new()));

/// Global DWARF analyzer
///
/// Starts out as `None`; `init_global_state` stores an analyzer here when one can be
/// built for the current executable. Readers must handle the `None` case.
static DWARF_ANALYZER: Lazy<Arc<Mutex<Option<DwarfAnalyzer>>>> =
    Lazy::new(|| Arc::new(Mutex::new(None)));

/// Global configuration
///
/// Built from `CallTraceConfig::default()` (which consults environment variables) the
/// first time the static is touched.
static CONFIG: Lazy<Arc<Mutex<CallTraceConfig>>> =
    Lazy::new(|| Arc::new(Mutex::new(CallTraceConfig::default())));

/// Global initialization flag
///
/// Guards `calltrace_init` so that `init_global_state` runs at most once.
static INIT: Once = Once::new();

/// Performance optimization: atomic flags for fast path checking
///
/// Mirror of `CallTraceConfig::enabled`, written during initialization and read by the
/// hook handlers without taking the `CONFIG` lock.
static TRACING_ENABLED: AtomicBool = AtomicBool::new(true);
/// Mirror of `CallTraceConfig::capture_arguments`, written during initialization.
static ARGUMENT_CAPTURE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Process-wide count of traced function entries; updated in batches from `LocalCounters`.
static FUNCTION_CALL_COUNT: AtomicU64 = AtomicU64::new(0);
/// Process-wide count of argument-capture attempts; updated in batches from `LocalCounters`.
static ARGUMENT_CAPTURE_COUNT: AtomicU64 = AtomicU64::new(0);

// Thread-local counters for batched atomic updates.
// Each thread accumulates into its own `LocalCounters` and only touches the global
// atomics above when a batch fills up or `flush` is called.
thread_local! {
    static LOCAL_COUNTERS: RefCell<LocalCounters> = RefCell::new(LocalCounters::new());
}

/// Local per-thread counters to reduce atomic operations
///
/// Counts accumulate here and are added to the global `FUNCTION_CALL_COUNT` /
/// `ARGUMENT_CAPTURE_COUNT` atomics once per batch (see the `impl` block).
#[derive(Debug)]
struct LocalCounters {
    // Function entries seen by this thread since the last batch update.
    function_calls: u64,
    // Argument-capture attempts made by this thread since the last batch update.
    argument_captures: u64,
    // Number of local increments after which a counter is pushed to its global atomic.
    batch_size: u64,
}

impl LocalCounters {
    /// Creates an empty counter set that publishes to the globals every 100 increments.
    fn new() -> Self {
        const FLUSH_EVERY: u64 = 100;
        Self {
            batch_size: FLUSH_EVERY,
            function_calls: 0,
            argument_captures: 0,
        }
    }

    /// Records one function call and returns an estimate of the process-wide total.
    ///
    /// When the local count reaches a multiple of `batch_size` it is moved into
    /// `FUNCTION_CALL_COUNT`; otherwise the estimate is the current global value plus
    /// this thread's unpublished calls.
    #[inline]
    fn increment_function_calls(&mut self) -> u64 {
        self.function_calls += 1;

        // Common case: batch not yet full, so only read the global counter.
        if self.function_calls % self.batch_size != 0 {
            return FUNCTION_CALL_COUNT.load(Ordering::Relaxed) + self.function_calls;
        }

        // Batch full: publish it and restart the local count from zero.
        let published = std::mem::take(&mut self.function_calls);
        FUNCTION_CALL_COUNT.fetch_add(published, Ordering::Relaxed) + published
    }

    /// Records one argument-capture attempt, publishing to `ARGUMENT_CAPTURE_COUNT`
    /// whenever the local count reaches a multiple of `batch_size`.
    #[inline]
    fn increment_argument_captures(&mut self) {
        self.argument_captures += 1;

        if self.argument_captures % self.batch_size != 0 {
            return;
        }

        let published = std::mem::take(&mut self.argument_captures);
        ARGUMENT_CAPTURE_COUNT.fetch_add(published, Ordering::Relaxed);
    }

    /// Publishes whatever is still pending locally (called during cleanup).
    fn flush(&mut self) {
        let pending_calls = std::mem::take(&mut self.function_calls);
        if pending_calls > 0 {
            FUNCTION_CALL_COUNT.fetch_add(pending_calls, Ordering::Relaxed);
        }

        let pending_captures = std::mem::take(&mut self.argument_captures);
        if pending_captures > 0 {
            ARGUMENT_CAPTURE_COUNT.fetch_add(pending_captures, Ordering::Relaxed);
        }
    }
}

/// Type alias for the function info cache to reduce complexity
///
/// Maps a function address to the result of a DWARF lookup. A stored `None` records
/// that a lookup was already attempted and produced nothing.
type FunctionInfoCache = Lazy<Arc<Mutex<HashMap<u64, Option<dwarf_analyzer::FunctionInfo>>>>>;

/// DWARF function information cache for performance optimization
///
/// Populated and size-limited by `get_cached_function_info`.
static FUNCTION_INFO_CACHE: FunctionInfoCache = Lazy::new(|| Arc::new(Mutex::new(HashMap::new())));

/// String intern pool for reducing memory allocations
/// Stores commonly used strings like type names, function names
///
/// Filled by `intern_string`; nothing in this file removes entries from it.
static STRING_INTERN_POOL: Lazy<Arc<Mutex<HashSet<String>>>> =
    Lazy::new(|| Arc::new(Mutex::new(HashSet::new())));

// Pre-allocated argument buffer pool to reduce allocations.
// `get_argument_buffer` pops from this per-thread pool.
// NOTE(review): no code in this file pushes buffers back into the pool — confirm that
// another module does, otherwise the pool is always empty.
thread_local! {
    static ARGUMENT_BUFFER_POOL: RefCell<Vec<Vec<register_reader::CapturedArgument>>> =
        const { RefCell::new(Vec::new()) };

    /// Pre-allocated string formatting buffers to reduce allocations
    ///
    /// NOTE(review): the address-formatting helpers in this file do not use this
    /// buffer; verify whether it is still referenced elsewhere.
    static FORMAT_BUFFER: RefCell<String> = RefCell::new(String::with_capacity(64));
}

/// CallTrace runtime configuration
///
/// This structure holds all configuration options that control CallTrace behavior.
/// Configuration is typically loaded from environment variables during initialization,
/// but can be modified programmatically if needed.
///
/// # Configuration Sources
///
/// 1. **Environment Variables** (primary source, read when the configuration is first built)
/// 2. **Default Values** (fallback when environment variables are not set)
/// 3. **Programmatic** (crate-internal code can modify it through the mutex-guarded `CONFIG` static)
///
/// # Fields
///
/// ## `enabled: bool`
/// Master enable/disable switch for all tracing functionality.
/// - **Environment**: `CALLTRACE_ENABLED` (1/true to enable)
///   (NOTE(review): confirm that the `Default` implementation actually reads this variable)
/// - **Default**: `true`
/// - **Performance Impact**: When disabled, overhead is <5ns per function call
///
/// ## `capture_arguments: bool`
/// Enable expensive argument capture using DWARF debugging information.
/// - **Environment**: `CALLTRACE_CAPTURE_ARGS` (1/true to enable)
/// - **Default**: `false`
/// - **Performance Impact**: 10-50x overhead when enabled, depending on argument complexity
/// - **Requirements**: Target program must be compiled with `-g` flag
///
/// ## `output_file: Option<String>`
/// Path where JSON trace output will be written.
/// - **Environment**: `CALLTRACE_OUTPUT`
/// - **Default**: `{executable_name}.json` in the same directory as the executable
/// - **Special Values**:
///   - `None`: No output generated
///   - `"stderr"`: Output to stderr (not implemented)
///   - `"stdout"`: Output to stdout (not implemented)
///
/// ## `max_call_depth: usize`
/// Maximum depth of function call nesting to trace.
/// - **Environment**: `CALLTRACE_MAX_DEPTH`
/// - **Default**: `100` (also used when the variable cannot be parsed as a number)
/// - **Purpose**: Prevents infinite recursion and limits memory usage
/// - **Behavior**: Calls deeper than this limit are ignored
///
/// ## `pretty_json: bool`
/// Whether to format JSON output with indentation and newlines.
/// - **Environment**: `CALLTRACE_PRETTY_JSON` (1/true to enable)
/// - **Default**: `true`
/// - **Trade-off**: Readable output vs. smaller file size
///
/// # Examples
///
/// ## Reading Current Configuration
/// ```rust
/// // Note: The live configuration is stored in the crate-private CONFIG static
/// use calltrace::CallTraceConfig;
///
/// // This is how configuration is accessed internally
/// // (Not recommended for external use)
/// ```
///
/// ## Environment Variable Configuration
/// ```bash
/// # Minimal configuration - basic tracing only
/// CALLTRACE_OUTPUT=trace.json ./your_program
///
/// # Full featured configuration
/// CALLTRACE_OUTPUT=full_trace.json \
/// CALLTRACE_CAPTURE_ARGS=1 \
/// CALLTRACE_MAX_DEPTH=50 \
/// CALLTRACE_DEBUG=1 \
/// CALLTRACE_PRETTY_JSON=1 \
/// ./your_program
///
/// # Performance-optimized configuration
/// CALLTRACE_OUTPUT=perf_trace.json \
/// CALLTRACE_CAPTURE_ARGS=0 \
/// CALLTRACE_MAX_DEPTH=200 \
/// CALLTRACE_PRETTY_JSON=0 \
/// ./your_program
/// ```
///
/// # Performance Guidelines
///
/// - **Production**: Set `capture_arguments = false` for minimal overhead
/// - **Development**: Enable `capture_arguments = true` for detailed debugging
/// - **Deep Recursion**: Increase `max_call_depth` if needed, but monitor memory usage
/// - **File Size**: Set `pretty_json = false` for smaller output files
///
/// # Thread Safety
///
/// Configuration is read once during initialization and then considered immutable.
/// The shared instance lives behind a `Mutex`; `enabled` and `capture_arguments` are
/// additionally mirrored into atomic flags during initialization so the hook handlers
/// can read them without locking.
#[derive(Debug, Clone)]
pub struct CallTraceConfig {
    /// Master switch for tracing.
    pub enabled: bool,
    /// Whether function arguments (and return values) are captured.
    pub capture_arguments: bool,
    /// Destination of the JSON trace; `None` means no output file is written.
    pub output_file: Option<String>,
    /// Maximum call nesting depth to trace.
    pub max_call_depth: usize,
    /// Whether the JSON output is pretty-printed.
    pub pretty_json: bool,
}

impl Default for CallTraceConfig {
    /// Builds the configuration from environment variables, falling back to built-in
    /// defaults for anything that is unset or unparsable.
    ///
    /// | Variable | Field | Fallback |
    /// |----------|-------|----------|
    /// | `CALLTRACE_ENABLED` | `enabled` | `true` |
    /// | `CALLTRACE_CAPTURE_ARGS` | `capture_arguments` | `false` |
    /// | `CALLTRACE_OUTPUT` | `output_file` | `{executable}.json` next to the executable |
    /// | `CALLTRACE_MAX_DEPTH` | `max_call_depth` | `100` |
    /// | `CALLTRACE_PRETTY_JSON` | `pretty_json` | `true` |
    ///
    /// Boolean variables are true for `1` or a case-insensitive `true`; any other value
    /// that is set counts as false.
    fn default() -> Self {
        /// Reads a boolean flag from the environment, returning `fallback` when the
        /// variable is unset or not valid Unicode.
        fn env_flag(name: &str, fallback: bool) -> bool {
            std::env::var(name)
                .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
                .unwrap_or(fallback)
        }

        Self {
            // Previously hard-coded to `true`, which ignored the documented
            // `CALLTRACE_ENABLED` switch.
            enabled: env_flag("CALLTRACE_ENABLED", true),
            capture_arguments: env_flag("CALLTRACE_CAPTURE_ARGS", false),
            output_file: std::env::var("CALLTRACE_OUTPUT")
                .ok()
                .or_else(generate_default_output_filename),
            max_call_depth: std::env::var("CALLTRACE_MAX_DEPTH")
                .ok()
                .and_then(|v| v.parse().ok())
                .unwrap_or(100),
            pretty_json: env_flag("CALLTRACE_PRETTY_JSON", true),
        }
    }
}

/// Initialize the CallTrace library
///
/// Runs the one-time global setup (`init_global_state`): build validation, mirroring
/// the configuration into the fast-path flags, DWARF analyzer construction and crash
/// handler installation.
///
/// # Safety
///
/// This function is invoked automatically by the library constructor when the shared
/// object is loaded. Calling it again is harmless: the setup is guarded by a `Once`,
/// so later calls do nothing.
///
/// # Returns
///
/// Always returns `0` for C compatibility, even when setup fails; a failure is only
/// reported on stderr.
///
/// # Environment Variables
///
/// The following environment variables influence initialization:
/// - `CALLTRACE_OUTPUT`: Output file path
/// - `CALLTRACE_CAPTURE_ARGS`: Enable argument capture (1/true)
/// - `CALLTRACE_MAX_DEPTH`: Maximum call depth (default: 100)
/// - `CALLTRACE_DEBUG`: Enable debug output
/// - `CALLTRACE_PRETTY_JSON`: Pretty-print JSON (default: true)
///
/// # Examples
///
/// Usually called automatically, but can be invoked manually:
///
/// ```c
/// // In C code
/// extern int calltrace_init(void);
/// int result = calltrace_init(); // Always 0
/// ```
#[no_mangle]
pub extern "C" fn calltrace_init() -> i32 {
    INIT.call_once(|| match init_global_state() {
        Ok(()) => {}
        Err(e) => eprintln!("CallTrace initialization failed: {:?}", e),
    });

    // The C-facing status code is fixed; errors are only logged above.
    0
}

/// Cleanup the CallTrace library and write final output
///
/// Performs the shutdown sequence, in order:
/// 1. Flushes the *calling thread's* pending thread-local counters into the global
///    counters (counters of other threads are not reachable from here).
/// 2. Generates and writes the final JSON trace via `write_final_output`.
/// 3. Asks the crash handler module to clean up (restore signal handlers).
///
/// # Safety
///
/// This function is invoked automatically by the library destructor when the shared
/// object is unloaded. It may also be called manually from C.
///
/// # Thread Safety
///
/// The function takes mutexes and performs file and stderr I/O, so it is **not**
/// async-signal-safe and should not be called from inside a signal handler. It is safe
/// to call while thread-local storage is being torn down: the counter flush is skipped
/// when the thread-local is no longer accessible or is already borrowed.
///
/// # Error Handling
///
/// Cleanup continues even if individual steps fail. A failure to write the output is
/// logged to stderr and does not prevent the crash handler cleanup from running.
///
/// # Examples
///
/// Usually called automatically, but can be invoked manually:
///
/// ```c
/// // In C code - force immediate cleanup and output
/// extern void calltrace_cleanup(void);
/// calltrace_cleanup();
/// ```
#[no_mangle]
pub extern "C" fn calltrace_cleanup() {
    // `try_with` / `try_borrow_mut` report failure as a value instead of panicking.
    // The previous `catch_unwind` wrapper still ran the panic hook (printing a message)
    // and could not intercept anything in builds using `panic = "abort"`.
    let _ = LOCAL_COUNTERS.try_with(|counters| {
        if let Ok(mut counters) = counters.try_borrow_mut() {
            counters.flush();
        }
    });
    // If flushing was not possible (during shutdown), continue cleanup anyway.

    // Generate final JSON output
    if let Err(e) = write_final_output() {
        eprintln!("CallTrace cleanup failed: {:?}", e);
    }

    // Cleanup crash handler (restore original signal handlers)
    crash_handler::cleanup_crash_handler();
}

/// Initialize global state
///
/// Steps, in order:
/// 1. Build validation setup (failure is logged, not fatal).
/// 2. Copy `enabled` / `capture_arguments` from `CONFIG` into the atomic fast-path flags.
/// 3. Build the DWARF analyzer for the current executable and store it in
///    `DWARF_ANALYZER` (failure is logged; tracing continues with limited symbol
///    resolution).
/// 4. Install the crash handler (failure is logged, not fatal).
///
/// Debug messages are printed whenever `CALLTRACE_DEBUG` is set, regardless of its value.
///
/// # Errors
///
/// Currently every failure is handled by logging, so this always returns `Ok(())`.
fn init_global_state() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize build validation first to ensure required compilation flags
    if let Err(e) = build_validator::init_build_validation() {
        eprintln!("CallTrace: Build validation initialization failed: {:?}", e);
    }

    // Set atomic flags for fast path checking. The CONFIG guard is confined to this
    // block: holding it across the DWARF and crash-handler setup below would deadlock
    // any code on those paths that reads the configuration (std mutexes are not
    // reentrant). A poisoned lock is recovered because the config is plain data.
    {
        let config = CONFIG
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        TRACING_ENABLED.store(config.enabled, Ordering::Relaxed);
        ARGUMENT_CAPTURE_ENABLED.store(config.capture_arguments, Ordering::Relaxed);
    }

    let debug_enabled = std::env::var("CALLTRACE_DEBUG").is_ok();

    // Always initialize DWARF analyzer for symbol resolution (independent of argument capture)
    let exe_path = std::env::current_exe()
        .or_else(|_| std::fs::read_link("/proc/self/exe"))
        .unwrap_or_else(|_| "/proc/self/exe".into());

    if let Some(exe_path_str) = exe_path.to_str() {
        match DwarfAnalyzer::new(exe_path_str) {
            Ok(analyzer) => {
                // The slot is overwritten wholesale, so a poisoned lock is safe to reuse.
                *DWARF_ANALYZER
                    .lock()
                    .unwrap_or_else(|poisoned| poisoned.into_inner()) = Some(analyzer);
                if debug_enabled {
                    eprintln!("CallTrace: DWARF analyzer initialized for symbol resolution");
                }
            }
            Err(e) => {
                eprintln!("CallTrace: DWARF analyzer initialization failed: {:?}", e);
                eprintln!("CallTrace: Continuing with limited symbol resolution");
            }
        }
    }

    // Initialize crash handler if no existing handlers are present
    if let Err(e) = crash_handler::init_crash_handler() {
        eprintln!("CallTrace: Crash handler initialization failed: {:?}", e);
        eprintln!("CallTrace: Continuing without crash detection");
    }

    if debug_enabled {
        eprintln!("CallTrace: Library initialized successfully");
    }
    Ok(())
}

/// Returns an owned copy of `s`, registering it in the intern pool when it looks like
/// a common type name or address.
///
/// A string is pool-eligible when it is at most 64 bytes long and starts with one of
/// `int`, `char`, `float`, `double`, `void`, `struct` or `0x`. Eligible strings not yet
/// in `STRING_INTERN_POOL` are added to it. The returned `String` is always a fresh
/// allocation equal to `s`, whether or not the pool was involved or its lock could be
/// taken.
#[inline]
pub(crate) fn intern_string(s: &str) -> String {
    const MAX_POOLED_LEN: usize = 64;
    const POOLED_PREFIXES: [&str; 7] = ["int", "char", "float", "double", "void", "struct", "0x"];

    let eligible = s.len() <= MAX_POOLED_LEN
        && POOLED_PREFIXES.iter().any(|prefix| s.starts_with(*prefix));

    if eligible {
        // A poisoned pool is simply skipped; the caller still gets its string.
        if let Ok(mut pool) = STRING_INTERN_POOL.lock() {
            if !pool.contains(s) {
                pool.insert(s.to_string());
            }
        }
    }

    s.to_string()
}

/// Hands out an empty argument buffer for the current thread.
///
/// A buffer popped from `ARGUMENT_BUFFER_POOL` is cleared and reused (keeping its
/// capacity); when the pool is empty a new vector with room for 16 arguments is
/// allocated instead.
#[inline]
fn get_argument_buffer() -> Vec<register_reader::CapturedArgument> {
    // Room for a typical function's argument count.
    const FRESH_CAPACITY: usize = 16;

    let recycled = ARGUMENT_BUFFER_POOL.with(|pool| pool.borrow_mut().pop());

    match recycled {
        Some(mut buffer) => {
            buffer.clear();
            buffer
        }
        None => Vec::with_capacity(FRESH_CAPACITY),
    }
}

/// Renders an address as lowercase hexadecimal with a `0x` prefix (e.g. `0x401020`).
///
/// Deliberately a plain `format!` with no thread-local scratch buffer: this can run
/// during cleanup, when thread-local storage may already be unavailable.
#[inline]
pub(crate) fn format_address(addr: u64) -> String {
    format!("{addr:#x}")
}

/// Renders `prefix`, an underscore and the address in lowercase hex
/// (e.g. `func_0x1234`).
///
/// Deliberately a plain `format!` with no thread-local scratch buffer: this can run
/// during cleanup, when thread-local storage may already be unavailable.
#[inline]
pub(crate) fn format_address_with_prefix(prefix: &str, addr: u64) -> String {
    format!("{prefix}_{addr:#x}")
}

/// Shared string literals, kept in one place so call sites reuse the same `&'static str`.
mod string_constants {
    /// Message stored in an argument value when extraction fails.
    pub const CAPTURE_FAILED: &str = "Capture failed";
    /// Textual form of a null address.
    pub const NULL_ADDRESS: &str = "0x0";
    /// Textual form of a null string pointer.
    pub const NULL_STRING: &str = "NULL";
    /// Name of the thread-creation function.
    pub const PTHREAD_CREATE: &str = "pthread_create";
    /// Generic fallback error message.
    pub const UNKNOWN_ERROR: &str = "Unknown error";
    /// Architecture identifier.
    pub const X86_64: &str = "x86_64";
}

/// Get function information with caching for performance optimization
#[inline]
fn get_cached_function_info(func_addr: u64) -> Option<dwarf_analyzer::FunctionInfo> {
    // Fast path: check cache first
    if let Ok(cache) = FUNCTION_INFO_CACHE.lock() {
        if let Some(cached_result) = cache.get(&func_addr) {
            return cached_result.clone();
        }
    }

    // Cache miss: query DWARF analyzer
    let function_info = if let Some(ref mut analyzer) = DWARF_ANALYZER.lock().unwrap().as_mut() {
        analyzer.get_function_info(func_addr).ok()
    } else {
        None
    };

    // Store result in cache (both hits and misses)
    if let Ok(mut cache) = FUNCTION_INFO_CACHE.lock() {
        cache.insert(func_addr, function_info.clone());

        // Limit cache size to prevent memory bloat
        const MAX_CACHE_SIZE: usize = 1024;
        if cache.len() > MAX_CACHE_SIZE {
            // Remove oldest entries (simple LRU approximation)
            let keys_to_remove: Vec<u64> = cache.keys().take(MAX_CACHE_SIZE / 4).copied().collect();
            for key in keys_to_remove {
                cache.remove(&key);
            }
        }
    }

    function_info
}

/// Write final JSON output
///
/// When an output path is configured, generates the trace session from the global call
/// tree manager, writes it to that path and reports the location on stderr. When
/// `output_file` is `None`, nothing is written.
///
/// # Errors
///
/// Propagates any error from generating the trace session or writing the file.
fn write_final_output() -> Result<(), Box<dyn std::error::Error>> {
    // Only the output path is needed, so copy just that and release the lock at once.
    // This runs during cleanup, possibly after a panic elsewhere poisoned the lock;
    // recovering the guard keeps the trace from being lost (the config is plain data).
    let output_file = CONFIG
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .output_file
        .clone();

    if let Some(output_file) = &output_file {
        let generator = JsonOutputGenerator::new();
        let trace_session = generator.generate_output(&CALL_TREE_MANAGER)?;

        generator.write_to_file(&trace_session, output_file)?;
        eprintln!("CallTrace: Output written to {}", output_file);
    }

    Ok(())
}

/// Derives the default trace path: `<executable stem>.json` in the executable's directory.
///
/// The executable is located through `std::env::current_exe`, falling back to the
/// `/proc/self/exe` symlink. Returns `None` when the executable cannot be located, has
/// no file stem or parent directory, or any path component is not valid UTF-8.
fn generate_default_output_filename() -> Option<String> {
    let exe_path = match std::env::current_exe() {
        Ok(path) => path,
        Err(_) => std::fs::read_link("/proc/self/exe").ok()?,
    };

    // File name without its extension, as UTF-8.
    let stem = exe_path.file_stem().and_then(|name| name.to_str())?;

    // Place the output next to the executable.
    let mut output_path = exe_path.parent()?.to_path_buf();
    output_path.push(format!("{}.json", stem));

    output_path.into_os_string().into_string().ok()
}

/// Get the base output filename (without extension) for crash reports
pub(crate) fn get_base_output_filename() -> Option<String> {
    // First try to get from configuration
    if let Ok(config) = CONFIG.lock() {
        if let Some(ref output_file) = config.output_file {
            // Remove .json extension if present
            if output_file.ends_with(".json") {
                return Some(output_file[..output_file.len() - 5].to_string());
            } else {
                return Some(output_file.clone());
            }
        }
    }

    // Fallback: generate from executable name (without extension)
    let exe_path = std::env::current_exe()
        .or_else(|_| std::fs::read_link("/proc/self/exe"))
        .ok()?;

    let file_stem = exe_path.file_stem()?.to_str()?;
    let exe_dir = exe_path.parent()?;
    let base_path = exe_dir.join(file_stem);

    base_path.to_str().map(str::to_string)
}

/// Handle function entry (called from cyg_profile)
///
/// Every invocation is first reported to the build validator. If tracing is disabled
/// nothing else happens. Otherwise the call is counted and recorded in the global call
/// tree, taking one of two paths:
///
/// - **Fast path** (argument capture off): records only the addresses and re-installs
///   the crash handlers every 20 calls during the first 100 calls, then every 2000.
/// - **Full path** (argument capture on): re-installs the crash handlers every 10 calls
///   during the first 100 calls, then every 1000; looks up the function's DWARF info
///   and, when it is available, captures the arguments from the current register state.
///
/// # Errors
///
/// Propagates errors returned by the call tree manager.
#[inline]
pub(crate) fn handle_function_enter_internal(
    func_address: *mut c_void,
    call_site: *mut c_void,
) -> Result<(), error::CallTraceError> {
    // The build validator counts hook invocations even when tracing is off.
    build_validator::record_function_hook_call();

    // Lock-free bail-out when tracing is disabled.
    if !TRACING_ENABLED.load(Ordering::Relaxed) {
        return Ok(());
    }

    let (func_addr, call_site_addr) = (func_address as u64, call_site as u64);

    // Batched, thread-local call counting; the result approximates the global total.
    let call_count =
        LOCAL_COUNTERS.with(|counters| counters.borrow_mut().increment_function_calls());
    let in_startup_window = call_count < 100;

    if !ARGUMENT_CAPTURE_ENABLED.load(Ordering::Relaxed) {
        // Fast path: addresses only.
        let _node_id = CALL_TREE_MANAGER.function_enter_fast_path(func_addr, call_site_addr)?;

        // Reinforce crash handlers only occasionally to keep overhead low.
        let reinforce_interval = if in_startup_window { 20 } else { 2000 };
        if call_count % reinforce_interval == 0 {
            let _ = crash_handler::reinforce_crash_handlers();
        }

        return Ok(());
    }

    // Full path: handlers are reinforced more often here, especially during startup.
    let reinforce_interval = if in_startup_window { 10 } else { 1000 };
    if call_count % reinforce_interval == 0 {
        let _ = crash_handler::reinforce_crash_handlers();
    }

    // DWARF description of the callee, needed to interpret the registers.
    let function_info = get_cached_function_info(func_addr);

    LOCAL_COUNTERS.with(|counters| counters.borrow_mut().increment_argument_captures());

    let arguments = match function_info.as_ref() {
        // SAFETY: NOTE(review): the contract of `RegisterContext::capture` lives in
        // `register_reader` and is not visible here — confirm it holds at this call site.
        Some(func_info) => match unsafe { register_reader::RegisterContext::capture() } {
            Ok(context) => capture_function_arguments(func_info, &context),
            Err(_) => Vec::new(),
        },
        None => Vec::new(),
    };

    // Record the call with everything gathered; no register context is passed on.
    let _node_id = CALL_TREE_MANAGER.function_enter(
        func_addr,
        call_site_addr,
        function_info,
        arguments,
        None,
    )?;

    Ok(())
}

/// Handle function exit (called from cyg_profile)
///
/// Does nothing when tracing is disabled. With argument capture off, the exit is
/// recorded through the call tree's fast path. With argument capture on, the return
/// registers are captured first and interpreted as follows:
///
/// - DWARF info available: the value is extracted according to the function's return type.
/// - No DWARF info but the capture is marked valid: `RAX` is reported as an integer.
/// - Capture failed or marked invalid: no return value is recorded.
///
/// # Errors
///
/// Propagates errors returned by the call tree manager.
#[inline]
pub(crate) fn handle_function_exit_internal(
    func_address: *mut c_void,
    _call_site: *mut c_void,
) -> Result<(), error::CallTraceError> {
    // Lock-free bail-out when tracing is disabled.
    if !TRACING_ENABLED.load(Ordering::Relaxed) {
        return Ok(());
    }

    let func_addr = func_address as u64;

    // Without argument capture there is no return value to collect.
    if !ARGUMENT_CAPTURE_ENABLED.load(Ordering::Relaxed) {
        CALL_TREE_MANAGER.function_exit_fast_path(func_addr)?;
        return Ok(());
    }

    // Registers are read before any other work on this path.
    // SAFETY: NOTE(review): the contract of `capture_return_values` lives in
    // `register_reader` and is not visible here — confirm it holds at this call site.
    let return_value = match unsafe { register_reader::capture_return_values() } {
        Err(_) => None,
        Ok(context) => match get_cached_function_info(func_addr) {
            // Known function: decode using its declared return type.
            Some(func_info) => {
                register_reader::extract_return_value(&context, func_info.return_type.as_ref())
            }
            // Unknown function: fall back to the raw integer in RAX.
            None if context.return_valid => {
                Some(register_reader::ArgumentValue::Integer(context.return_rax))
            }
            None => None,
        },
    };

    CALL_TREE_MANAGER.function_exit_with_return_value(func_addr, return_value)?;

    Ok(())
}

/// Capture function arguments using register context and function info
///
/// Produces one `CapturedArgument` per parameter described in `function_info`, in
/// declaration order, capped at 16 arguments to bound the overhead.
///
/// For each parameter the storage location is classified from its type name, size
/// (8 bytes when unknown) and position. Structs, arrays and pointers with a known base
/// type go through the type-aware extractor; everything else uses the basic extractor.
/// A failed extraction still yields an entry: its value is `ArgumentValue::Unknown`,
/// `valid` is `false` and `error` holds the debug rendering of the failure.
///
/// # Returns
///
/// The captured arguments; an empty vector when the function has no parameters.
#[inline]
fn capture_function_arguments(
    function_info: &dwarf_analyzer::FunctionInfo,
    register_context: &register_reader::RegisterContext,
) -> Vec<register_reader::CapturedArgument> {
    // Limit argument capture to reasonable number to prevent excessive overhead
    const MAX_ARGS: usize = 16;

    let param_count = function_info.parameters.len();

    // Early return for functions with no parameters
    if param_count == 0 {
        return Vec::new();
    }
    let max_args = param_count.min(MAX_ARGS);

    // Get a pre-allocated (empty) buffer from the pool
    let mut arguments = get_argument_buffer();

    // `Vec::reserve` takes the number of *additional* elements beyond `len`, and at most
    // `max_args` elements are pushed, so that is the amount to ask for. The previous
    // `reserve(param_count - capacity)` passed a capacity difference and therefore did
    // not guarantee enough room.
    arguments.reserve(max_args);

    for (i, param) in function_info.parameters.iter().enumerate().take(max_args) {
        let location = register_reader::classify_argument(
            &param.type_info.name,
            param.type_info.size.unwrap_or(8) as usize,
            i,
        );

        // Try enhanced extraction for complex types, fallback to basic for simple types
        let needs_type_aware_extraction = param.type_info.is_struct
            || param.type_info.is_array
            || (param.type_info.is_pointer && param.type_info.base_type.is_some());

        let extraction = if needs_type_aware_extraction {
            register_reader::extract_argument_with_type_info(
                register_context,
                &location,
                &param.type_info,
                param,
            )
        } else {
            // Fast path for basic types
            register_reader::extract_argument(
                register_context,
                &location,
                &param.type_info.name,
                param.type_info.is_pointer,
            )
        };

        let name = intern_string(&param.name);
        let type_name = intern_string(&param.type_info.name);

        // Consume the result once instead of cloning the value and inspecting it again.
        let (value, valid, error) = match extraction {
            Ok(extracted) => (extracted, true, None),
            Err(e) => (
                register_reader::ArgumentValue::Unknown {
                    type_name: intern_string(&param.type_info.name),
                    raw_data: Vec::new(),
                    error: Some(string_constants::CAPTURE_FAILED.to_string()),
                },
                false,
                Some(format!("{:?}", e)),
            ),
        };

        arguments.push(register_reader::CapturedArgument {
            name,
            type_name,
            location,
            value,
            valid,
            error,
        });
    }

    arguments
}

/// Library constructor - automatically called when loaded
///
/// Registered through `#[ctor::ctor]`. It only forwards to `calltrace_init` and
/// discards the returned status code.
#[ctor::ctor]
fn library_init() {
    calltrace_init();
}

/// Library destructor - automatically called when unloaded
///
/// Registered through `#[ctor::dtor]`. It only forwards to `calltrace_cleanup`, which
/// writes the final output and tears down the crash handler.
#[ctor::dtor]
fn library_cleanup() {
    calltrace_cleanup();
}