1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// Extraction strategy for getting content from a page.
///
/// Serialized as an internally tagged enum: the variant name is written in
/// snake_case under the `strategy` key, next to the variant's own fields
/// (in JSON, for example, `{"strategy": "selector", "selector": "main"}`;
/// `JsonLd` is spelled `json_ld`).
///
/// `Default` is derived and yields [`Extraction::Auto`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(tag = "strategy", rename_all = "snake_case")]
pub enum Extraction {
    /// Auto-detect main content using readability heuristics
    #[default]
    Auto,
    /// Use a specific CSS selector
    Selector { selector: String },
    /// Use entire page body
    Full,
    /// Extract specific meta tags
    Meta { tags: Vec<String> },
    /// RSS/Atom feed - use pre-formatted item text
    Rss,
    /// Extract JSON-LD structured data (schema.org)
    JsonLd {
        /// Optional filter for specific @type(s) like "Product", "Article".
        /// May be left out when deserializing, in which case it is `None`.
        #[serde(default)]
        types: Option<Vec<String>>,
    },
}
/// Normalization options applied before hashing.
///
/// Every field may be omitted when deserializing: `strip_whitespace` falls
/// back to `true`, the remaining flags to `false`, and `ignore_patterns` to an
/// empty list.
///
/// `Default` is implemented by hand instead of derived so that
/// `Normalization::default()` produces exactly those same values. A derived
/// impl would set `strip_whitespace` to `false` and disagree with what
/// deserializing an empty object yields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Normalization {
    /// Strip whitespace variations (default: true)
    #[serde(default = "default_true")]
    pub strip_whitespace: bool,
    /// Strip timestamps and dates (default: false)
    #[serde(default)]
    pub strip_dates: bool,
    /// Strip random IDs or cache-busters (default: false)
    #[serde(default)]
    pub strip_random_ids: bool,
    /// Custom regex patterns to ignore (default: none)
    #[serde(default)]
    pub ignore_patterns: Vec<String>,
}

impl Default for Normalization {
    /// Returns the same values serde uses for missing fields.
    fn default() -> Self {
        Self {
            // Must stay in sync with the `default = "default_true"` attribute above.
            strip_whitespace: true,
            strip_dates: false,
            strip_random_ids: false,
            ignore_patterns: Vec::new(),
        }
    }
}
/// Serde default helper that always yields `true`.
///
/// Named from `#[serde(default = "default_true")]` attributes on boolean
/// fields that should be switched on when the field is missing from the input.
fn default_true() -> bool {
    true
}
/// Filter target - what the filter operates on
///
/// Variant names are (de)serialized in snake_case: `new`, `diff`, `old`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FilterTarget {
/// Match against new content
New,
/// Match against the diff text
Diff,
/// Match against old content
Old,
}
/// A filter rule applied after detecting a change
///
/// `on` selects the text the rule looks at; the remaining fields are optional
/// criteria. A criterion that is `None` is left out of the serialized form and
/// may be absent when deserializing.
///
/// NOTE(review): how several criteria set on the same rule are combined is
/// decided by the code that evaluates filters, which is not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Filter {
/// What to operate on
pub on: FilterTarget,
/// Contains this text
#[serde(skip_serializing_if = "Option::is_none")]
pub contains: Option<String>,
/// Does not contain this text
#[serde(skip_serializing_if = "Option::is_none")]
pub not_contains: Option<String>,
/// Matches this regex pattern (stored as an uncompiled string)
#[serde(skip_serializing_if = "Option::is_none")]
pub matches: Option<String>,
/// Diff size greater than N chars
#[serde(skip_serializing_if = "Option::is_none")]
pub size_gt: Option<usize>,
}
/// Fetch engine to use
///
/// Defaults to [`Engine::Http`]. Variant names are (de)serialized in
/// snake_case (`http`, `playwright`, `rss`, `shell`).
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum Engine {
/// HTTP engine; this is the default variant
#[default]
Http,
/// Playwright engine
Playwright,
/// RSS/Atom feed parsing
Rss,
/// Shell command output
Shell {
/// The command whose output is used as content
command: String,
},
}
/// Agent configuration for AI-powered change analysis
///
/// The optional string fields are left out of the serialized form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgentConfig {
/// Whether the agent is enabled (treated as `true` when missing from the input)
#[serde(default = "default_true")]
pub enabled: bool,
/// Custom prompt template (uses default if None)
#[serde(skip_serializing_if = "Option::is_none")]
pub prompt_template: Option<String>,
/// Custom instructions for this specific watch
#[serde(skip_serializing_if = "Option::is_none")]
pub instructions: Option<String>,
}
/// A watch definition
///
/// When deserializing, `id`, `name`, `url`, `interval_secs` and `created_at`
/// carry no serde default and therefore must be present; every other field
/// may be omitted. Optional values that are `None`, an empty `headers` map and
/// an empty `tags` list are left out of the serialized form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Watch {
/// Unique identifier
pub id: Uuid,
/// User-friendly name
pub name: String,
/// URL to monitor
pub url: String,
/// Fetch engine to use (falls back to `Engine::default()` when missing)
#[serde(default)]
pub engine: Engine,
/// Extraction strategy (falls back to `Extraction::default()` when missing)
#[serde(default)]
pub extraction: Extraction,
/// Normalization options (falls back to `Normalization::default()` when missing)
#[serde(default)]
pub normalization: Normalization,
/// Filter rules (empty = any change triggers notification)
#[serde(default)]
pub filters: Vec<Filter>,
/// Agent configuration (None = no agent)
#[serde(skip_serializing_if = "Option::is_none")]
pub agent_config: Option<AgentConfig>,
/// Check interval in seconds
pub interval_secs: u64,
/// Whether the watch is enabled (treated as `true` when missing from the input)
#[serde(default = "default_true")]
pub enabled: bool,
/// When the watch was created
pub created_at: DateTime<Utc>,
/// Custom headers for authenticated requests
#[serde(default, skip_serializing_if = "std::collections::HashMap::is_empty")]
pub headers: std::collections::HashMap<String, String>,
/// Path to cookie file (Netscape format)
#[serde(skip_serializing_if = "Option::is_none")]
pub cookie_file: Option<String>,
/// Playwright storage state file for authenticated sessions
#[serde(skip_serializing_if = "Option::is_none")]
pub storage_state: Option<String>,
/// Per-watch notification target (overrides global default)
#[serde(skip_serializing_if = "Option::is_none")]
pub notify_target: Option<crate::config::NotifyTarget>,
/// Tags for organization and filtering
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<String>,
/// Whether to include the user's interest profile in AI analysis
/// (treated as `false` when missing from the input)
#[serde(default)]
pub use_profile: bool,
}
impl Watch {
/// Create a new watch with defaults
pub fn new(name: String, url: String) -> Self {
Self {
id: Uuid::new_v4(),
name,
url,
engine: Engine::default(),
extraction: Extraction::default(),
normalization: Normalization::default(),
filters: Vec::new(),
agent_config: None,
interval_secs: 900, // 15 minutes
enabled: true,
created_at: Utc::now(),
headers: std::collections::HashMap::new(),
cookie_file: None,
storage_state: None,
notify_target: None,
tags: Vec::new(),
use_profile: false, // Opt-in per watch
}
}
}
/// A snapshot of page content at a point in time
///
/// This type derives only `Debug` and `Clone`; it has no serde
/// (de)serialization of its own.
#[derive(Debug, Clone)]
pub struct Snapshot {
/// Identifier of this snapshot
pub id: Uuid,
/// Identifier of the watch this snapshot belongs to
pub watch_id: Uuid,
/// When the content was fetched (UTC)
pub fetched_at: DateTime<Utc>,
/// Raw HTML (zstd compressed), only kept for last 5
pub raw_html: Option<Vec<u8>>,
/// Extracted and normalized content
pub extracted: String,
/// SHA-256 hash of extracted content
pub content_hash: String,
}
/// A detected change between snapshots
///
/// This type can be serialized but not deserialized (it derives `Serialize`
/// only).
#[derive(Debug, Clone, Serialize)]
pub struct Change {
/// Identifier of this change record
pub id: Uuid,
/// Identifier of the watch the change belongs to
pub watch_id: Uuid,
/// When the change was detected (UTC)
pub detected_at: DateTime<Utc>,
/// Identifier of the snapshot before the change
pub old_snapshot_id: Uuid,
/// Identifier of the snapshot after the change
pub new_snapshot_id: Uuid,
/// The diff text
pub diff: String,
/// Whether filters passed
pub filter_passed: bool,
/// Agent response if agent was invoked (arbitrary JSON)
pub agent_response: Option<serde_json::Value>,
/// Whether notification was sent
pub notified: bool,
}
/// Agent memory stored per-watch
///
/// Every field may be missing from the input and then starts out empty;
/// `AgentMemory::default()` is likewise entirely empty.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AgentMemory {
/// Named integer counters
#[serde(default)]
pub counters: std::collections::HashMap<String, i64>,
/// Flexible value storage - accepts any JSON type (string, number, bool, etc.)
#[serde(default)]
pub last_values: std::collections::HashMap<String, serde_json::Value>,
/// Free-form notes; `truncate_to_limit` treats the front of the list as
/// the oldest entries and removes from there first
#[serde(default)]
pub notes: Vec<String>,
}
impl AgentMemory {
/// Maximum size in bytes (16KB)
pub const MAX_SIZE: usize = 16 * 1024;
/// Serialize to JSON string
pub fn to_json(&self) -> Result<String, serde_json::Error> {
serde_json::to_string(self)
}
/// Parse from JSON string
pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
serde_json::from_str(json)
}
/// Check if memory exceeds size limit
pub fn is_over_limit(&self) -> bool {
self.to_json().map(|s| s.len() > Self::MAX_SIZE).unwrap_or(true)
}
/// Truncate oldest notes until under limit, also expire notes older than 7 days
pub fn truncate_to_limit(&mut self) {
use chrono::{Duration, Utc};
// First, expire notes older than 7 days
let cutoff = Utc::now() - Duration::days(7);
self.notes.retain(|note| {
// Try to extract timestamp from note (format: "2026-01-15T03:43:33" or "CRITICAL: 2026-01-15T03:43:33")
let timestamp_str = if note.starts_with("CRITICAL:") || note.starts_with("WARNING:") {
// Extract after the prefix
note.split_whitespace().nth(1)
} else {
// First word is the timestamp
note.split(':').next().and_then(|s| s.split_whitespace().next())
};
if let Some(ts) = timestamp_str {
// Try to parse as ISO 8601 timestamp
if let Ok(note_time) = chrono::DateTime::parse_from_rfc3339(ts) {
return note_time > cutoff;
}
// Also try parsing just the date portion with time
if let Ok(note_time) = chrono::NaiveDateTime::parse_from_str(ts, "%Y-%m-%dT%H:%M:%S") {
let note_utc = chrono::DateTime::<Utc>::from_naive_utc_and_offset(note_time, Utc);
return note_utc > cutoff;
}
}
// If we can't parse the timestamp, keep the note
true
});
// Then truncate if still over limit
while self.is_over_limit() && !self.notes.is_empty() {
self.notes.remove(0);
}
}
}
/// A simple reminder notification (not a web watcher)
///
/// Optional fields that are `None` are left out of the serialized form. All
/// non-optional fields, including `enabled`, carry no serde default and must
/// be present when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Reminder {
/// Unique identifier
pub id: Uuid,
/// User-friendly name
pub name: String,
/// Optional message text for the reminder
#[serde(skip_serializing_if = "Option::is_none")]
pub message: Option<String>,
/// When to trigger (a UTC date-time value, not a raw integer timestamp)
pub trigger_at: DateTime<Utc>,
/// Interval for recurring reminders (None = one-shot)
#[serde(skip_serializing_if = "Option::is_none")]
pub interval_secs: Option<u64>,
/// Whether the reminder is enabled
pub enabled: bool,
/// Per-reminder notification target override
#[serde(skip_serializing_if = "Option::is_none")]
pub notify_target: Option<crate::config::NotifyTarget>,
/// When the reminder was created
pub created_at: DateTime<Utc>,
}