# valknut-rs 1.3.1
#
# High-performance Rust implementation of valknut code analysis algorithms
# Documentation
# Valknut configuration example
# Copy to `valknut.yml` (or pass with `--config`) and adjust for your repository.
# All sections shown here map directly to `ValknutConfig` in the Rust CLI.

# Core analysis settings: module toggles, language selection, file selection,
# quality limits, and coverage-report discovery.
analysis:
  # One boolean per analysis module; all six are switched on in this example.
  modules:
    complexity: true
    dependencies: true
    duplicates: true
    refactoring: true
    structure: true
    coverage: true
  languages:
    # Every language listed here also has an entry under
    # `complexity_thresholds` below and under the top-level `languages` section.
    enabled:
      - python
      - javascript
      - typescript
      - rust
      - go
    # NOTE(review): the top-level `languages` section sets its own
    # `max_file_size_mb` per language (5.0 for javascript/typescript, 8.0 for
    # go); which value takes precedence is not visible from this file - confirm.
    max_file_size_mb: 10.0
    # These values equal the `complexity_threshold` of the matching entry in
    # the top-level `languages` section.
    complexity_thresholds:
      python: 10.0
      javascript: 10.0
      typescript: 10.0
      rust: 15.0
      go: 12.0
  files:
    include_patterns:
      - "**/*"
    # NOTE(review): these patterns use single `*` segments, whereas
    # `dedupe.exclude` uses `**`. Confirm whether `*` spans directory
    # separators in the matcher before relying on nested-directory exclusion.
    exclude_patterns:
      - "*/node_modules/*"
      - "*/venv/*"
      - "*/target/*"
      - "*/__pycache__/*"
      - "*.min.js"
    max_files: null              # null = no limit
    follow_symlinks: false
  quality:
    confidence_threshold: 0.7
    # The unit is not stated here; presumably seconds, like
    # `performance.file_timeout_seconds` (also 30) - confirm.
    max_analysis_time_per_file: 30
    strict_mode: false
  coverage:
    enabled: true
    # NOTE(review): presumably an explicit coverage report path; the top-level
    # `coverage` section names its equivalent key `coverage_file` - confirm.
    file_path: null
    auto_discover: true
    max_age_days: 7
    # Same eight entries, in the same order, as the top-level
    # `coverage.search_paths`.
    search_paths:
      - "./coverage/"
      - "./target/coverage/"
      - "./target/tarpaulin/"
      - "./.coverage/"
      - "./htmlcov/"
      - "./build/coverage/"
      - "./reports/"
      - "./"

# Denoise settings.
# NOTE(review): several values here repeat ones in the `dedupe` section
# (`min_function_tokens`, `min_match_tokens`, `weights`,
# `io_mismatch_penalty`, `threshold_s`). How the two sections interact is not
# visible from this file - confirm before changing only one of them.
denoise:
  enabled: true
  auto: true
  min_function_tokens: 40
  min_match_tokens: 24
  require_blocks: 2
  # `similarity` and `threshold_s` (below) carry the same value, 0.82.
  similarity: 0.82
  # The three weights sum to 1.0 (0.35 + 0.45 + 0.20).
  weights:
    ast: 0.35
    pdg: 0.45
    emb: 0.20
  io_mismatch_penalty: 0.25
  threshold_s: 0.82
  stop_motifs:
    enabled: true
    # NOTE(review): `dedupe.adaptive.stop_motif_percentile` is 0.75 while this
    # is 0.5; confirm whether the difference is intentional.
    percentile: 0.5
    refresh_days: 7
  auto_calibration:
    enabled: true
    quality_target: 0.8
    sample_size: 200
    max_iterations: 50
  # Same values as `dedupe.rank_by`, `dedupe.min_saved_tokens`, and
  # `dedupe.adaptive.min_rarity_gain`.
  ranking:
    by: "saved_tokens"
    min_saved_tokens: 100
    min_rarity_gain: 1.2
  dry_run: false

# Scoring settings: normalization scheme, per-category weights, and
# statistical parameters.
scoring:
  normalization_scheme: "z_score"
  use_bayesian_fallbacks: true
  confidence_reporting: false
  # The weights are not normalised (they total 3.9), so presumably only their
  # ratios matter - confirm.
  weights:
    complexity: 1.0
    graph: 0.8
    structure: 0.9
    style: 0.5
    coverage: 0.7
  statistical_params:
    confidence_level: 0.95
    min_sample_size: 10
    # Unit not stated; with `normalization_scheme: "z_score"` this is
    # presumably a number of standard deviations - confirm.
    outlier_threshold: 3.0

# Graph-analysis settings: metric toggles plus limits for exact computation.
graph:
  # Betweenness and cycle detection are on in this example; closeness is off.
  enable_betweenness: true
  enable_closeness: false
  enable_cycle_detection: true
  # NOTE(review): presumably graphs larger than `max_exact_size` use the
  # approximation controlled by the two keys below - confirm.
  max_exact_size: 10000
  use_approximation: true
  approximation_sample_rate: 0.1

# LSH settings (the key names suggest MinHash-style locality-sensitive hashing
# - confirm against the implementation).
lsh:
  # 128 is an exact multiple of 16, i.e. 8 hashes per band if hashes are split
  # evenly across bands - confirm.
  num_hashes: 128
  num_bands: 16
  shingle_size: 3
  similarity_threshold: 0.7
  max_candidates: 100
  use_semantic_similarity: false

# Dedupe settings: path filters, size and match thresholds, similarity
# weights, stop phrases, ranking, and adaptive options.
# NOTE(review): `min_function_tokens`, `min_match_tokens`, `weights`,
# `io_mismatch_penalty`, and `threshold_s` repeat the values in the `denoise`
# section; how the two sections interact is not visible here - confirm.
dedupe:
  include:
    - "src/**"
  exclude:
    - "benchmarks/**"
    - "examples/**"
    - "datasets/**"
    - "**/generated/**"
    - "**/*.pb.rs"
  min_function_tokens: 40
  min_ast_nodes: 35
  min_match_tokens: 24
  min_match_coverage: 0.40
  shingle_k: 9
  require_distinct_blocks: 2
  # The three weights sum to 1.0 (0.35 + 0.45 + 0.20).
  weights:
    ast: 0.35
    pdg: 0.45
    emb: 0.20
  io_mismatch_penalty: 0.25
  threshold_s: 0.82
  # Double-quoted YAML strings: each `\\` below is one literal backslash in
  # the parsed value, so the first entry parses to `^\s*@staticmethod\b`.
  # Presumably these are compiled as regular expressions - confirm.
  stop_phrases:
    - "^\\s*@staticmethod\\b"
    - "group\\.bench_with_input\\s*\\("
    - "\\bb\\.iter\\s*\\(\\|\\|"
    - "\\bgroup\\.finish\\s*\\(\\)\\s*;?"
    - "\\blet\\s+config\\s*=\\s*AnalysisConfig::(new|default)\\s*\\(\\)\\s*;?"
    - "\\bchecks\\.push\\s*\\(\\s*HealthCheck\\s*\\{"
  rank_by: "saved_tokens"
  min_saved_tokens: 100
  keep_top_per_file: 3
  adaptive:
    auto_denoise: true
    adaptive_learning: true
    rarity_weighting: true
    structural_validation: true
    # NOTE(review): `denoise.stop_motifs.percentile` is 0.5 while this is 0.75;
    # confirm whether the difference is intentional.
    stop_motif_percentile: 0.75
    hub_suppression_threshold: 0.6
    quality_gate_percentage: 0.8
    tfidf_kgram_size: 8
    wl_iterations: 3
    min_rarity_gain: 1.2
    external_call_jaccard_threshold: 0.2
    # Same value as `denoise.stop_motifs.refresh_days` (7).
    cache_refresh_days: 7
    auto_refresh_cache: true

# Per-language settings. Every entry carries the same six keys, and each
# `complexity_threshold` equals the matching value under
# `analysis.languages.complexity_thresholds`.
languages:
  python:
    enabled: true
    file_extensions:
      - ".py"
      - ".pyi"
    tree_sitter_language: "python"
    max_file_size_mb: 10.0
    complexity_threshold: 10.0
    additional_settings: {}
  javascript:
    enabled: true
    file_extensions:
      - ".js"
      - ".mjs"
      - ".jsx"
    tree_sitter_language: "javascript"
    max_file_size_mb: 5.0
    complexity_threshold: 10.0
    additional_settings: {}
  typescript:
    enabled: true
    # NOTE(review): ".d.ts" also ends in ".ts"; whether it needs its own entry
    # depends on how extensions are matched - confirm.
    file_extensions:
      - ".ts"
      - ".tsx"
      - ".d.ts"
    tree_sitter_language: "typescript"
    max_file_size_mb: 5.0
    complexity_threshold: 10.0
    additional_settings: {}
  rust:
    enabled: true
    file_extensions:
      - ".rs"
    tree_sitter_language: "rust"
    max_file_size_mb: 10.0
    complexity_threshold: 15.0
    additional_settings: {}
  go:
    enabled: true
    file_extensions:
      - ".go"
    tree_sitter_language: "go"
    max_file_size_mb: 8.0
    complexity_threshold: 12.0
    additional_settings: {}

# I/O settings: cache location and lifetime, report location and format.
io:
  cache_dir: ".valknut/cache"
  enable_caching: true
  # 3600 seconds = 1 hour.
  cache_ttl_seconds: 3600
  # NOTE(review): "./reports/" is also one of the coverage `search_paths`
  # entries in this file; confirm that sharing the directory is intended.
  report_dir: "./reports"
  report_format: "json"

# Performance settings: thread and memory limits, timeouts, batching.
performance:
  max_threads: null             # null = auto-detect
  memory_limit_mb: null         # null = no limit
  # Same value as `analysis.quality.max_analysis_time_per_file` (30).
  file_timeout_seconds: 30
  # NOTE(review): presumably null means no overall timeout, by analogy with
  # `memory_limit_mb` - confirm.
  total_timeout_seconds: null
  enable_simd: false
  batch_size: 100

# Structure settings: two pack toggles, a cap on reported packs, and three
# groups of limits (`fsdir`, `fsfile`, `partitioning`).
structure:
  enable_branch_packs: true
  enable_file_split_packs: true
  top_packs: 20
  # Limits whose key names refer to directories.
  fsdir:
    max_files_per_dir: 25
    max_subdirs_per_dir: 10
    max_dir_loc: 2000
    min_branch_recommendation_gain: 0.15
    min_files_for_split: 5
    target_loc_per_subdir: 1000
  # Limits whose key names refer to single files (lines of code and bytes).
  fsfile:
    huge_loc: 800
    huge_bytes: 128000
    min_split_loc: 200
    min_entities_per_split: 3
  partitioning:
    balance_tolerance: 0.25
    max_clusters: 4
    min_clusters: 2
    # Four fallback names, which equals `max_clusters`.
    naming_fallbacks:
      - "core"
      - "io"
      - "api"
      - "util"

# Coverage-report discovery settings.
# NOTE(review): `analysis.coverage` repeats `auto_discover`, `max_age_days`,
# and the same `search_paths` list; which section takes precedence is not
# visible from this file - confirm.
coverage:
  auto_discover: true
  search_paths:
    - "./coverage/"
    - "./target/coverage/"
    - "./target/tarpaulin/"
    - "./.coverage/"
    - "./htmlcov/"
    - "./build/coverage/"
    - "./reports/"
    - "./"
  # Five bare file names followed by `**/`-prefixed variants of four of them;
  # "coverage.lcov" has no `**/` variant.
  file_patterns:
    - "coverage.xml"
    - "lcov.info"
    - "coverage.json"
    - "coverage.lcov"
    - "cobertura.xml"
    - "**/coverage.xml"
    - "**/lcov.info"
    - "**/coverage.json"
    - "**/cobertura.xml"
  max_age_days: 7
  coverage_file: null           # Set to override auto-discovery