Struct EvalCodingAgentArgs

Source

pub struct EvalCodingAgentArgs {
    pub fixtures: Vec<String>,
    pub models: Vec<String>,
    pub tool_formats: Vec<String>,
    pub output: Option<PathBuf>,
    pub env_files: Vec<PathBuf>,
    pub include_local: bool,
    pub local_providers: Vec<String>,
    pub max_local_models: usize,
    pub keep_local_after_run: bool,
    pub max_runs: Option<usize>,
    pub replicates: usize,
    pub max_iterations: usize,
    pub python: String,
    pub fail_on_unauthorized: bool,
    pub json: bool,
    pub step_judge: Option<String>,
    pub step_judge_on_veto: Option<String>,
    pub step_judge_adversarial: bool,
    pub override_reason: Option<String>,
    pub structural_validator: Option<String>,
    pub run_label: String,
    pub baseline_comparison_against: Option<PathBuf>,
}

Fields§

§fixtures: Vec<String>

Fixture ids to run (comma-separated, repeatable). Use `all` for the full suite.

§models: Vec<String>

Model selectors to run (comma-separated, repeatable). Each entry may be an alias, `provider:model`, or `provider=...,model=...`.

§tool_formats: Vec<String>

Tool-call rendering modes to compare.

§output: Option<PathBuf>

Output directory for summary.json, per_run.jsonl, transcripts, and markdown reports.

§env_files: Vec<PathBuf>

Optional .env file(s) to load for provider credentials. Values are never written to artifacts.

§include_local: bool

Append reachable local Ollama/llama.cpp/MLX/vLLM models to the selected matrix.

§local_providers: Vec<String>

Restrict local discovery to one provider id. Repeatable.

§max_local_models: usize

Maximum discovered local models to append.

§keep_local_after_run: bool

Leave newly-loaded Ollama models running after each local benchmark run.

§max_runs: Option<usize>

Stop after N matrix entries, useful for cost-capped smoke runs.

§replicates: usize

Number of independent trials for each fixture/model/tool-format cell.

§max_iterations: usize

Maximum repair-agent loop iterations per run.

§python: String

Python executable used by the fixture and verification command.

§fail_on_unauthorized: bool

Treat missing credentials as an error instead of skipping the run.

§json: bool

Print the aggregate summary JSON to stdout.

§step_judge: Option<String>

Optional `step_judge` config applied to every run in this invocation. Accepts a preset name (`symmetric-cheap`, `asymmetric`, `symmetric-strong`) which expands to a known `{model, provider}` pair, or `custom:<json>` for a literal JSON object passed verbatim to `agent_loop({step_judge: ...})`. Omit (or pass `none` / `off`) to disable. For matrix sweeps across presets, the step-judge experiment driver at `experiments/step-judge/run.sh` invokes the eval runner once per preset and aggregates.

§step_judge_on_veto: Option<String>

Override the `on_veto` remediation shape for the step-judge config (`replace` or `retain`). Default is `replace`.

§step_judge_adversarial: bool

Use the adversarial rubric variant.

§override_reason: Option<String>

Free-form reason attached when forcing a tool format against catalog guidance.

§structural_validator: Option<String>

Structural-validator config applied to every run in this invocation. Omit to use the suite default (currently the 4-rule validator). Accepts `on` / `default`, `off` / `none`, or `custom:<json>` for a literal JSON object passed to `with_structural_validator(...)`.

§run_label: String

Free-form label persisted in summary.json for grouping repeat runs (e.g. “replicate-1”, “probe-judge-arch-gpt”). Defaults to empty.

§baseline_comparison_against: Option<PathBuf>

Path to a previous coding-agent summary.json (or its parent dir). When present, the new summary embeds a baseline_comparison block listing per-fixture regressions (baseline passed but this cell failed) and recoveries (baseline failed but this cell passed), plus aggregate counts and a net lift in percentage points. Useful for cross-cell A/Bs (provider sweep, prompt change, step judge on/off) where net pass-rate hides destructive interactions like the cli-help-flag regression the step-judge experiment surfaced (harn#2318).

EvalCodingAgentArgs

Struct EvalCodingAgentArgs Copy item path

Fields§

Trait Implementations§

impl Args for EvalCodingAgentArgs

fn group_id() -> Option<Id>

fn augment_args<'b>(__clap_app: Command) -> Command

fn augment_args_for_update<'b>(__clap_app: Command) -> Command

impl Debug for EvalCodingAgentArgs

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl FromArgMatches for EvalCodingAgentArgs

fn from_arg_matches(__clap_arg_matches: &ArgMatches) -> Result<Self, Error>

fn from_arg_matches_mut( __clap_arg_matches: &mut ArgMatches, ) -> Result<Self, Error>

fn update_from_arg_matches( &mut self, __clap_arg_matches: &ArgMatches, ) -> Result<(), Error>

fn update_from_arg_matches_mut( &mut self, __clap_arg_matches: &mut ArgMatches, ) -> Result<(), Error>

Auto Trait Implementations§

impl Freeze for EvalCodingAgentArgs

impl RefUnwindSafe for EvalCodingAgentArgs

impl Send for EvalCodingAgentArgs

impl Sync for EvalCodingAgentArgs

impl Unpin for EvalCodingAgentArgs

impl UnsafeUnpin for EvalCodingAgentArgs

impl UnwindSafe for EvalCodingAgentArgs

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<'a, T, E> AsTaggedExplicit<'a, E> for T where T: 'a,

fn explicit(self, class: Class, tag: u32) -> TaggedParser<'a, Explicit, Self, E>

impl<'a, T, E> AsTaggedImplicit<'a, E> for T where T: 'a,

fn implicit( self, class: Class, constructed: bool, tag: u32, ) -> TaggedParser<'a, Implicit, Self, E>

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT where ST: ?Sized, DT: ?Sized,

impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT where ST: ?Sized, DT: ?Sized,

impl<T> Downcast for T where T: Any,

fn into_any(self: Box<T>) -> Box<dyn Any>

fn into_any_rc(self: Rc<T>) -> Rc<dyn Any>

fn as_any(&self) -> &(dyn Any + 'static)

fn as_any_mut(&mut self) -> &mut (dyn Any + 'static)

impl<T> DowncastSync for T where T: Any + Send + Sync,

fn into_any_arc(self: Arc<T>) -> Arc<dyn Any + Sync + Send>

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> FutureExt for T

fn with_context(self, otel_cx: Context) -> WithContext<Self>

fn with_current_context(self) -> WithContext<Self>

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Paint for T where T: ?Sized,

fn fg(&self, value: Color) -> Painted<&T>

§Example

fn primary(&self) -> Painted<&T>

§Example

fn fixed(&self, color: u8) -> Painted<&T>

§Example

fn rgb(&self, r: u8, g: u8, b: u8) -> Painted<&T>

§Example

fn black(&self) -> Painted<&T>

§Example

fn red(&self) -> Painted<&T>

§Example

fn green(&self) -> Painted<&T>

§Example

fn yellow(&self) -> Painted<&T>

§Example

fn blue(&self) -> Painted<&T>

§Example

fn magenta(&self) -> Painted<&T>

§Example

fn cyan(&self) -> Painted<&T>

§Example

fn white(&self) -> Painted<&T>

Struct EvalCodingAgentArgs

impl<T> Any for T
where T: 'static + ?Sized,

impl<'a, T, E> AsTaggedExplicit<'a, E> for T
where T: 'a,

impl<'a, T, E> AsTaggedImplicit<'a, E> for T
where T: 'a,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
where ST: ?Sized, DT: ?Sized,

impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
where ST: ?Sized, DT: ?Sized,

impl<T> Downcast for T
where T: Any,

impl<T> DowncastSync for T
where T: Any + Send + Sync,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> Paint for T
where T: ?Sized,