use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::OnceLock;
use ferrum_bench_core::{global_profile, profile_fields_from_json};
use ferrum_interfaces::kv_dtype::{KvDtypeKind, KvFp16};
use ferrum_kernels::backend::{
Backend, BackendGraph, BackendMoeFused, BackendPagedKv, BackendQuantGguf, BackendQuantMarlin,
KvCache, LlmBackend, MoeLlmBackend, QuantLlmBackend,
};
use ferrum_quantization::WeightLoader;
use ferrum_types::{FerrumError, Result};
use crate::common::paged_pool::{block_hash_chain, BlockHash};
use crate::common::{DecoderOnlyLLM, LlmRuntimeConfig};
use crate::models::llama_family::{LlamaFamilyConfig, LlamaFamilyLayer, RopeCache};
use crate::models::qwen3_moe_profile::*;
use crate::models::qwen3_moe_runtime::Qwen3MoeRuntimeEnv;
use crate::moe::{moe_forward, ExpertStack};
use crate::moe_config::Qwen3MoeConfig;
// Private submodules of this module. Their contents are not visible from this
// file; NOTE(review): presumably each holds part of the `Qwen3MoeModel`
// implementation, split by concern — confirm against the submodule files.
mod api;
mod decode_batch;
mod forward_layer;
mod kv;
mod load;
mod prefill_decode;
mod prefix_cache;
mod scratch;
// Re-export the scratch types so callers can name them through this module's
// path even though `scratch` itself is private.
pub use scratch::{Qwen3MoeLayerState, Qwen3MoeScratch};
/// State container for a Qwen3 MoE model, generic over the compute backend
/// `B` and the KV-cache dtype kind `K` (which defaults to [`KvFp16`]).
///
/// The struct bundles configuration, backend buffers, per-layer state,
/// KV-cache storage and a handful of bookkeeping flags/counters. The methods
/// that operate on it are not defined in this file.
///
/// NOTE(review): field order is significant for drop order (Rust drops struct
/// fields in declaration order), so avoid reordering fields casually.
pub struct Qwen3MoeModel<B: MoeLlmBackend, K: KvDtypeKind = KvFp16> {
/// Model configuration.
pub cfg: Qwen3MoeConfig,
/// Runtime configuration shared with the common LLM layer
/// (`crate::common::LlmRuntimeConfig`).
pub runtime_cfg: LlmRuntimeConfig,
/// Crate-visible runtime environment settings.
/// NOTE(review): presumably captured once at construction — confirm in the loader.
pub(crate) runtime_env: Qwen3MoeRuntimeEnv,
/// Flag recording variable-length QKV support.
/// NOTE(review): presumably a capability of backend `B` probed at load time — confirm.
pub(crate) supports_varlen_qkv: bool,
/// Flag recording batched MoE GEMV support.
/// NOTE(review): presumably a capability of backend `B` probed at load time — confirm.
pub(crate) supports_batched_moe_gemv: bool,
/// Backend buffer for the embedding.
/// NOTE(review): presumably the token-embedding table; shape/dtype not visible here.
pub embed: B::Buffer,
/// Attention-side layer state, reusing the Llama-family layer type.
/// NOTE(review): presumably one entry per decoder layer — confirm.
pub attn_layers: Vec<LlamaFamilyLayer<B>>,
/// MoE-side layer state.
/// NOTE(review): presumably index-aligned with `attn_layers` — confirm.
pub moe_layers: Vec<Qwen3MoeLayerState<B>>,
/// Backend buffer for the final norm weight.
pub final_norm_w: B::Buffer,
/// Output projection, held as a boxed `Linear` trait object so the concrete
/// implementation is chosen at runtime.
pub lm_head: Box<dyn ferrum_quantization::Linear<B>>,
/// RoPE cache for backend `B`.
pub rope: RopeCache<B>,
/// Scratch state for backend `B`.
/// NOTE(review): presumably reusable working buffers for the forward pass — confirm.
pub scratch: Qwen3MoeScratch<B>,
/// KV caches keyed by a string id; each entry holds a vector of caches.
/// NOTE(review): presumably key = sequence/cache id and one `KvCache` per layer — confirm.
pub kv_caches: HashMap<String, Vec<KvCache<B, K>>>,
/// Private pool of KV-cache sets, in the same per-entry shape as the values
/// of `kv_caches`.
/// NOTE(review): presumably released caches kept for reuse — confirm in `kv`.
kv_free_pool: Vec<Vec<KvCache<B, K>>>,
/// Optional paged pools, stored as pairs of backend buffers.
/// NOTE(review): presumably (key, value) pools per layer, `None` when paging is off — confirm.
pub paged_pools: Option<Vec<(B::Buffer, B::Buffer)>>,
/// Optional second set of paged pools, again stored as buffer pairs.
/// NOTE(review): "fa" presumably refers to a flash-attention layout — confirm.
pub paged_fa_pools: Option<Vec<(B::Buffer, B::Buffer)>>,
/// Optional block allocator for the paged pools, wrapped in a `std::sync::Mutex`
/// so it can be locked for mutation through a shared reference.
pub paged_block_alloc: Option<std::sync::Mutex<crate::common::paged_pool::BlockAllocator>>,
/// Optional pair of paged-pool dimensions.
/// NOTE(review): the meaning of the two `usize` values is not visible here — confirm.
pub paged_dims: Option<(usize, usize)>,
/// Warmup counter for the batched graph path.
/// NOTE(review): exact counting semantics are defined elsewhere — confirm.
pub(crate) batched_graph_warmup: usize,
/// Flag marking a batched graph failure.
/// NOTE(review): presumably disables further graph attempts once set — confirm.
pub(crate) batched_graph_failed: bool,
/// Set of `u64` batched-graph keys already seen.
pub(crate) batched_graph_keys_seen: std::collections::HashSet<u64>,
/// Counter of prefix-cache hits.
pub(crate) prefix_cache_hits: u64,
/// Counter of prefix-cache misses.
pub(crate) prefix_cache_misses: u64,
/// Counter of prefill tokens saved by the prefix cache.
pub(crate) prefix_cache_saved_prefill_tokens: u64,
/// Flag selecting the vLLM-style paged attention path.
/// NOTE(review): where this is set and consumed is not visible here — confirm.
pub(crate) use_vllm_paged_attn: bool,
}