1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Production Task Queue Configuration
# This configuration is optimized for production environments with
# security, performance, and monitoring best practices
#
# ENHANCED AUTO-SCALING:
# Features multidimensional scaling metrics, adaptive threshold learning,
# and stability controls for intelligent worker management
[]
# Redis connection with SSL/TLS support for production
= "rediss://redis-cluster.production.com:6380"
= 20
= 10 # seconds
= 5 # seconds
[]
# Optimized for production workloads
= 10
= 25
= 30 # seconds
= 120 # seconds
[]
= true
= 5
= 50
= 2 # workers to add when scaling up
= 1 # workers to remove when scaling down
# Multidimensional scaling triggers (Enhanced Auto-scaling)
# The system now considers 5 different metrics simultaneously for scaling decisions:
# - Queue pressure: weighted queue depth accounting for priority queues
# - Worker utilization: actual busy/idle ratio of workers
# - Task complexity: analysis of task execution patterns
# - Error rate: system health indicator
# - Memory pressure: resource utilization per worker
[]
= 1.2 # weighted queue depth per worker
= 0.85 # target worker utilization (85%)
= 2.0 # complex task overload factor
= 0.03 # maximum 3% error rate
= 1024.0 # memory usage per worker (MB)
# Adaptive threshold learning (SLA-driven optimization)
= true
= 0.05 # conservative learning for production
= 60 # longer window for stability
# Hysteresis and stability controls
= 180 # 3 minutes between scale-ups
= 900 # 15 minutes between scale-downs
= 3 # require 3 consecutive signals
# SLA performance targets for adaptive learning
# When adaptive thresholds are enabled, the system automatically adjusts
# scaling triggers based on actual performance vs these SLA targets
[]
= 3000.0 # 3 second P95 latency target
= 0.99 # 99% success rate target
= 5000.0 # 5 second max queue wait
= 0.75 # optimal 75% worker utilization
[]
= true
= 30 # seconds
= 100
[]
= true
[]
# Enhanced production monitoring
= true
= 60 # seconds
= 30
= true
# Enhanced auto-scaling metrics collection
= true
= true
= true
= 30 # seconds between scaling metric snapshots
[]
# Production alert thresholds
= 10000
= 0.05 # 5%
= 300000 # 5 minutes
= 2048 # 2GB
= 600 # 10 minutes
# Enhanced auto-scaling monitoring thresholds
= 2.0 # alert if queue pressure exceeds 2.0
= 0.10 # alert if workers severely underutilized
= 0.95 # alert if workers overloaded
= 10 # alert if scaling too frequently
= 0.98 # alert if SLA success rate drops
= 5000 # alert if P95 latency exceeds 5s
[]
# Security configurations
= 16
= true
= ["^[a-zA-Z0-9_-]+$"]
= true
= 1000
[]
# Structured logging for production
= "info"
= "json"
= true
= false
[]
# Health check endpoints configuration
= true
= 8080
= "/health"
= false
= 5000
[]
# Production web server settings
= true
= "/api/v1/tasks"
= true
= true
[]
# Production web server settings
= true
= "/api/v1/tasks"
= true
= true