adler-core 0.14.0

Core engine for the Adler OSINT username-search tool.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
//! Watchlist configuration model.
//!
//! This module is intentionally runtime-free: it models who should be
//! watched and which registry scope each watched identity uses. CLI,
//! Web, MCP, or a future scheduler can parse JSON/TOML/YAML into these
//! serde-compatible structs and then call [`WatchlistConfig::scan_targets`]
//! to get concrete `(username, SiteFilter)` work items.

use std::collections::HashSet;

use serde::{Deserialize, Serialize};

use crate::{SiteFilter, Username};

/// Current schema version for watchlist configuration documents.
///
/// This is the `schema_version` produced by [`WatchlistConfig::default`] and
/// the value serde fills in when a document omits the field.
pub const WATCHLIST_CONFIG_SCHEMA_VERSION: u16 = 1;

/// Top-level watchlist configuration.
///
/// Every field carries a serde default, so an empty document (`{}` in JSON)
/// deserializes into an empty watchlist. Deserialization itself performs no
/// semantic checks; call [`WatchlistConfig::validate`] (or
/// [`WatchlistConfig::scan_targets`], which validates first) before use.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WatchlistConfig {
    /// Schema version for tolerant future readers.
    ///
    /// Falls back to [`WATCHLIST_CONFIG_SCHEMA_VERSION`] when the document
    /// omits it. The value is not inspected by [`WatchlistConfig::validate`].
    #[serde(default = "default_schema_version")]
    pub schema_version: u16,
    /// Optional default scan scope inherited by every target.
    ///
    /// Combined with each target's own scope via [`WatchScope::merged`].
    #[serde(default)]
    pub default_scope: WatchScope,
    /// Optional repeated-scan policy. Runtime surfaces decide how to execute it.
    ///
    /// Left out of serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub schedule: Option<ScanSchedule>,
    /// Watched identities.
    ///
    /// Left out of serialized output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub targets: Vec<WatchTarget>,
}

impl Default for WatchlistConfig {
    /// Builds an empty watchlist: current schema version, an all-default
    /// scope, no schedule, and no targets.
    fn default() -> Self {
        let schema_version = WATCHLIST_CONFIG_SCHEMA_VERSION;
        let default_scope = WatchScope::default();
        Self {
            schema_version,
            default_scope,
            schedule: None,
            targets: vec![],
        }
    }
}

impl WatchlistConfig {
    /// Check the schedule, every primary username, and every alias.
    ///
    /// Handles are compared ASCII-case-insensitively across the whole
    /// watchlist. A handle that shows up twice is refused, whether as a
    /// primary username or as an alias, because a later timeline could not
    /// tell which watched identity owns the resulting scan artifact.
    ///
    /// # Errors
    ///
    /// Returns the first problem found, walking targets in order and, within
    /// a target, the primary username before its aliases:
    /// [`WatchlistError::InvalidSchedule`], [`WatchlistError::EmptyUsername`],
    /// [`WatchlistError::EmptyAlias`], [`WatchlistError::InvalidUsername`], or
    /// [`WatchlistError::DuplicateUsername`].
    pub fn validate(&self) -> Result<(), WatchlistError> {
        // The schedule is checked before any target so its error wins.
        self.schedule
            .as_ref()
            .map_or(Ok(()), |plan| plan.validate())?;

        let mut taken: HashSet<String> = HashSet::new();
        for (target_index, entry) in self.targets.iter().enumerate() {
            let primary = entry.username.as_str();
            if primary.trim().is_empty() {
                return Err(WatchlistError::EmptyUsername { target_index });
            }
            validate_username(primary)?;
            insert_unique(&mut taken, primary)?;

            entry.aliases.iter().try_for_each(|alias| {
                if alias.trim().is_empty() {
                    return Err(WatchlistError::EmptyAlias {
                        username: entry.username.clone(),
                    });
                }
                validate_username(alias)?;
                insert_unique(&mut taken, alias)
            })?;
        }
        Ok(())
    }

    /// Turn the watchlist into the flat list of scans to run.
    ///
    /// For every target this emits the primary username first, followed by
    /// its aliases in declaration order. All of a target's work items carry
    /// the primary username as `identity` and share one filter built from the
    /// default scope merged with the target's own scope.
    ///
    /// # Errors
    ///
    /// Runs [`WatchlistConfig::validate`] first and forwards its error.
    pub fn scan_targets(&self) -> Result<Vec<WatchScanTarget>, WatchlistError> {
        self.validate()?;
        let work: Vec<WatchScanTarget> = self
            .targets
            .iter()
            .flat_map(|entry| {
                let filter = self.default_scope.merged(&entry.scope).to_site_filter();
                std::iter::once(&entry.username)
                    .chain(&entry.aliases)
                    .map(move |handle| WatchScanTarget {
                        identity: entry.username.clone(),
                        username: handle.clone(),
                        scope: filter.clone(),
                    })
            })
            .collect();
        Ok(work)
    }
}

/// Serde default provider for `WatchlistConfig::schema_version`.
///
/// Serde's `default = "..."` attribute needs a function path, so this wraps
/// [`WATCHLIST_CONFIG_SCHEMA_VERSION`].
const fn default_schema_version() -> u16 {
    WATCHLIST_CONFIG_SCHEMA_VERSION
}

/// Repeated scan policy for a watchlist.
///
/// This type intentionally contains no timers, tasks, or async runtime hooks.
/// A caller can persist the last started scan timestamp, ask whether a plan is
/// due, and then launch scans using its own scheduler.
///
/// See [`ScanSchedule::next_due_ms`] and [`ScanSchedule::is_due`] for the due
/// computation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanSchedule {
    /// Seconds between repeated scans. Must be greater than zero and at most
    /// `u64::MAX / 1_000` so it can be converted to milliseconds. Both bounds
    /// are enforced by [`ScanSchedule::validate`], not by deserialization.
    pub every_secs: u64,
    /// Optional Unix epoch millisecond timestamp before which the plan is not
    /// due. Omit for an immediately due first scan.
    ///
    /// Left out of serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub start_at_ms: Option<u64>,
}

impl ScanSchedule {
    /// Check that `every_secs` is usable as a repeat interval.
    ///
    /// # Errors
    ///
    /// Returns [`WatchlistError::InvalidSchedule`] when `every_secs` is zero
    /// or exceeds `u64::MAX / 1_000`, the largest value that still converts
    /// to milliseconds without overflow.
    pub fn validate(&self) -> Result<(), WatchlistError> {
        const MAX_EVERY_SECS: u64 = u64::MAX / 1_000;

        let reason = match self.every_secs {
            0 => "every_secs must be greater than zero",
            secs if secs > MAX_EVERY_SECS => "every_secs is too large to convert to milliseconds",
            _ => return Ok(()),
        };
        Err(WatchlistError::InvalidSchedule {
            reason: reason.to_owned(),
        })
    }

    /// Millisecond timestamp at which the next scan becomes due.
    ///
    /// `last_started_at_ms` is when the previous scan for this schedule
    /// started; pass `None` if no scan has run yet. The result is the later
    /// of "previous start plus interval" and `start_at_ms`; whichever of the
    /// two is absent is ignored, and `0` is returned when both are absent.
    /// Arithmetic saturates at `u64::MAX` instead of overflowing.
    #[must_use]
    pub fn next_due_ms(&self, last_started_at_ms: Option<u64>) -> u64 {
        let period_ms = self.every_secs.saturating_mul(1_000);
        // A missing start time behaves like a start at the epoch.
        let earliest = self.start_at_ms.unwrap_or(0);
        last_started_at_ms.map_or(earliest, |previous| {
            previous.saturating_add(period_ms).max(earliest)
        })
    }

    /// Whether a scan should be started at `now_ms`.
    ///
    /// True once `now_ms` has reached [`ScanSchedule::next_due_ms`].
    #[must_use]
    pub fn is_due(&self, last_started_at_ms: Option<u64>, now_ms: u64) -> bool {
        now_ms >= self.next_due_ms(last_started_at_ms)
    }
}

/// One watched identity.
///
/// [`WatchlistConfig::scan_targets`] expands a target into one scan for
/// `username` plus one scan per alias, all attributed to `username`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WatchTarget {
    /// Primary username / handle.
    ///
    /// Must not be empty or whitespace-only; this is a required field when
    /// deserializing.
    pub username: String,
    /// Additional handles that should be tracked as the same identity.
    ///
    /// Left out of serialized output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub aliases: Vec<String>,
    /// Optional scope overriding or extending the watchlist default.
    ///
    /// Applied on top of `WatchlistConfig::default_scope` through
    /// [`WatchScope::merged`].
    #[serde(default)]
    pub scope: WatchScope,
}

/// Site/tag scope for watchlist scans.
///
/// The derived `Default` has every list empty and every scalar unset. Empty
/// lists and unset scalars are left out of serialized output.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct WatchScope {
    /// Keep only sites whose name contains at least one term.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub only: Vec<String>,
    /// Drop sites whose name contains any term.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude: Vec<String>,
    /// Keep only sites carrying at least one requested tag.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tag: Vec<String>,
    /// Drop sites carrying any of these tags.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub exclude_tag: Vec<String>,
    /// Include `nsfw`-tagged sites. `None` means inherit the default scope.
    ///
    /// If still unset after merging, [`WatchScope::to_site_filter`] treats it
    /// as `false`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub include_nsfw: Option<bool>,
    /// Optional popularity-rank ceiling.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top: Option<u32>,
}

impl WatchScope {
    /// Merge a default scope (`self`) with an overriding per-target scope.
    ///
    /// List fields are concatenated, default entries first and override
    /// entries after; duplicates are not removed. Scalar fields
    /// (`include_nsfw`, `top`) take the override's value when it is set and
    /// keep the default's value otherwise.
    ///
    /// Going by the field documentation, `only` and `tag` are match-any
    /// lists, so entries appended there add alternatives rather than tighten
    /// the scope, while entries appended to `exclude` / `exclude_tag` drop
    /// additional sites. The final matching rules live in [`SiteFilter`].
    #[must_use]
    pub fn merged(&self, override_scope: &Self) -> Self {
        // `concat` allocates each merged list once at its final size instead
        // of cloning `self` and then cloning every override list a second
        // time just to feed `extend`.
        let joined = |base: &[String], extra: &[String]| [base, extra].concat();
        Self {
            only: joined(&self.only, &override_scope.only),
            exclude: joined(&self.exclude, &override_scope.exclude),
            tag: joined(&self.tag, &override_scope.tag),
            exclude_tag: joined(&self.exclude_tag, &override_scope.exclude_tag),
            include_nsfw: override_scope.include_nsfw.or(self.include_nsfw),
            top: override_scope.top.or(self.top),
        }
    }

    /// Convert into the core registry filter.
    ///
    /// Lists and `top` are copied across unchanged (`only` becomes `include`,
    /// `tag` becomes `tags`, `exclude_tag` becomes `exclude_tags`). An unset
    /// `include_nsfw` is resolved to `false`.
    #[must_use]
    pub fn to_site_filter(&self) -> SiteFilter {
        SiteFilter {
            include: self.only.clone(),
            exclude: self.exclude.clone(),
            tags: self.tag.clone(),
            exclude_tags: self.exclude_tag.clone(),
            include_nsfw: self.include_nsfw.unwrap_or(false),
            top: self.top,
        }
    }
}

/// Concrete scan work item derived from a watchlist.
///
/// Produced by [`WatchlistConfig::scan_targets`]: one item per primary
/// username and one per alias.
#[derive(Debug, Clone)]
pub struct WatchScanTarget {
    /// Primary watched identity this scan contributes to.
    ///
    /// Always the owning target's primary username, including for alias scans.
    pub identity: String,
    /// Username/alias to scan.
    pub username: String,
    /// Registry scope for this scan.
    ///
    /// Built from the watchlist default scope merged with the target's scope.
    pub scope: SiteFilter,
}

/// Watchlist validation error.
///
/// Returned by [`WatchlistConfig::validate`], [`WatchlistConfig::scan_targets`],
/// and [`ScanSchedule::validate`].
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum WatchlistError {
    /// Target had an empty (or whitespace-only) username.
    #[error("watch target at index {target_index} has an empty username")]
    EmptyUsername {
        /// Index in [`WatchlistConfig::targets`].
        target_index: usize,
    },
    /// Alias was empty (or whitespace-only).
    #[error("watch target {username:?} has an empty alias")]
    EmptyAlias {
        /// Primary username carrying the empty alias.
        username: String,
    },
    /// Username failed core validation.
    #[error("invalid username {username:?}: {reason}")]
    InvalidUsername {
        /// Username or alias that failed validation.
        username: String,
        /// Human-readable validation reason.
        reason: String,
    },
    /// Same concrete username appeared more than once.
    ///
    /// Comparison ignores ASCII case and spans primary usernames and aliases
    /// of all targets.
    #[error("duplicate watch username or alias {username:?}")]
    DuplicateUsername {
        /// Duplicated username/alias, as written at its second occurrence.
        username: String,
    },
    /// Schedule policy is invalid.
    #[error("invalid watch schedule: {reason}")]
    InvalidSchedule {
        /// Human-readable validation reason.
        reason: String,
    },
}

/// Run a handle through the core [`Username`] constructor and discard the
/// constructed value; only success or failure matters here.
///
/// A constructor failure is reported as [`WatchlistError::InvalidUsername`]
/// carrying the offending handle and the constructor error's display text.
fn validate_username(username: &str) -> Result<(), WatchlistError> {
    match Username::new(username.to_owned()) {
        Ok(_) => Ok(()),
        Err(err) => Err(WatchlistError::InvalidUsername {
            username: username.to_owned(),
            reason: err.to_string(),
        }),
    }
}

/// Record `username` in `seen`, refusing handles that are already present.
///
/// The stored key is the ASCII-lowercased handle, so `alice` and `Alice`
/// collide. On a collision the error carries the handle exactly as given.
fn insert_unique(seen: &mut HashSet<String>, username: &str) -> Result<(), WatchlistError> {
    // `HashSet::insert` returns `true` only when the key was not yet present.
    if seen.insert(username.to_ascii_lowercase()) {
        Ok(())
    } else {
        Err(WatchlistError::DuplicateUsername {
            username: username.to_owned(),
        })
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the watchlist model: serde defaults, schedule
    //! arithmetic, validation errors, and scan-target expansion.

    use super::*;

    #[test]
    fn defaults_schema_version_and_empty_targets() {
        let cfg: WatchlistConfig = serde_json::from_str("{}").unwrap();

        assert_eq!(cfg.schema_version, WATCHLIST_CONFIG_SCHEMA_VERSION);
        assert!(cfg.targets.is_empty());
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn serializes_compact_scope() {
        let cfg = WatchlistConfig {
            default_scope: WatchScope {
                tag: vec!["social".into()],
                exclude_tag: vec!["bot-protected".into()],
                top: Some(100),
                ..WatchScope::default()
            },
            schedule: Some(ScanSchedule {
                every_secs: 86_400,
                start_at_ms: Some(1_800_000_000_000),
            }),
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: vec!["alice_dev".into()],
                scope: WatchScope::default(),
            }],
            ..WatchlistConfig::default()
        };

        let json = serde_json::to_value(&cfg).unwrap();
        assert_eq!(json["schema_version"], WATCHLIST_CONFIG_SCHEMA_VERSION);
        assert_eq!(json["default_scope"]["tag"][0], "social");
        assert_eq!(json["schedule"]["every_secs"], 86_400);
        assert_eq!(json["targets"][0]["username"], "alice");
        assert_eq!(json["targets"][0]["aliases"][0], "alice_dev");
        // The per-target `scope` key itself is always emitted, even when empty.
        assert!(json["targets"][0].get("scope").is_some());
    }

    #[test]
    fn schedule_is_due_immediately_without_start_or_previous_run() {
        let schedule = ScanSchedule {
            every_secs: 60,
            start_at_ms: None,
        };

        assert_eq!(schedule.next_due_ms(None), 0);
        assert!(schedule.is_due(None, 1));
    }

    #[test]
    fn schedule_uses_start_and_last_run_for_next_due() {
        let schedule = ScanSchedule {
            every_secs: 60,
            start_at_ms: Some(10_000),
        };

        assert_eq!(schedule.next_due_ms(None), 10_000);
        assert_eq!(schedule.next_due_ms(Some(12_000)), 72_000);
        assert!(!schedule.is_due(Some(12_000), 71_999));
        assert!(schedule.is_due(Some(12_000), 72_000));
    }

    #[test]
    fn schedule_waits_for_start_when_later_than_next_interval() {
        let schedule = ScanSchedule {
            every_secs: 60,
            start_at_ms: Some(100_000),
        };

        // Last run + interval is 72_000, but the start gate is later.
        assert_eq!(schedule.next_due_ms(Some(12_000)), 100_000);
        assert!(!schedule.is_due(Some(12_000), 99_999));
        assert!(schedule.is_due(Some(12_000), 100_000));
    }

    #[test]
    fn validate_rejects_zero_schedule_interval() {
        let cfg = WatchlistConfig {
            schedule: Some(ScanSchedule {
                every_secs: 0,
                start_at_ms: None,
            }),
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert!(matches!(err, WatchlistError::InvalidSchedule { .. }));
    }

    #[test]
    fn validate_rejects_interval_that_overflows_milliseconds() {
        let max_secs = u64::MAX / 1_000;
        let at_limit = ScanSchedule {
            every_secs: max_secs,
            start_at_ms: None,
        };
        let over_limit = ScanSchedule {
            every_secs: max_secs + 1,
            start_at_ms: None,
        };

        assert!(at_limit.validate().is_ok());
        assert!(matches!(
            over_limit.validate(),
            Err(WatchlistError::InvalidSchedule { .. })
        ));
    }

    #[test]
    fn expands_aliases_with_merged_scope() {
        let cfg = WatchlistConfig {
            default_scope: WatchScope {
                tag: vec!["social".into()],
                exclude_tag: vec!["bot-protected".into()],
                top: Some(500),
                ..WatchScope::default()
            },
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: vec!["alice_dev".into(), "alice-osint".into()],
                scope: WatchScope {
                    only: vec!["Git".into()],
                    tag: vec!["dev".into()],
                    top: Some(50),
                    ..WatchScope::default()
                },
            }],
            ..WatchlistConfig::default()
        };

        let targets = cfg.scan_targets().unwrap();

        assert_eq!(targets.len(), 3);
        assert_eq!(targets[0].identity, "alice");
        assert_eq!(targets[1].username, "alice_dev");
        assert_eq!(targets[2].username, "alice-osint");
        assert_eq!(targets[0].scope.include, ["Git"]);
        assert_eq!(targets[0].scope.tags, ["social", "dev"]);
        assert_eq!(targets[0].scope.exclude_tags, ["bot-protected"]);
        assert_eq!(targets[0].scope.top, Some(50));
    }

    #[test]
    fn target_scope_inherits_unset_scalars_from_default() {
        let cfg = WatchlistConfig {
            default_scope: WatchScope {
                include_nsfw: Some(true),
                top: Some(500),
                ..WatchScope::default()
            },
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: Vec::new(),
                scope: WatchScope::default(),
            }],
            ..WatchlistConfig::default()
        };

        let targets = cfg.scan_targets().unwrap();

        assert_eq!(targets.len(), 1);
        assert!(targets[0].scope.include_nsfw);
        assert_eq!(targets[0].scope.top, Some(500));

        // With nothing set anywhere, `include_nsfw` resolves to `false`.
        assert!(!WatchScope::default().to_site_filter().include_nsfw);
    }

    #[test]
    fn rejects_blank_primary_username() {
        let cfg = WatchlistConfig {
            targets: vec![
                WatchTarget {
                    username: "alice".into(),
                    aliases: Vec::new(),
                    scope: WatchScope::default(),
                },
                WatchTarget {
                    username: "   ".into(),
                    aliases: Vec::new(),
                    scope: WatchScope::default(),
                },
            ],
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert_eq!(err, WatchlistError::EmptyUsername { target_index: 1 });
    }

    #[test]
    fn rejects_blank_alias() {
        let cfg = WatchlistConfig {
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: vec![String::new()],
                scope: WatchScope::default(),
            }],
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert_eq!(
            err,
            WatchlistError::EmptyAlias {
                username: "alice".into(),
            }
        );
    }

    #[test]
    fn rejects_duplicate_aliases_case_insensitively() {
        let cfg = WatchlistConfig {
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: vec!["Alice".into()],
                scope: WatchScope::default(),
            }],
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert!(matches!(err, WatchlistError::DuplicateUsername { .. }));
    }

    #[test]
    fn rejects_alias_duplicating_another_targets_username() {
        let cfg = WatchlistConfig {
            targets: vec![
                WatchTarget {
                    username: "alice".into(),
                    aliases: Vec::new(),
                    scope: WatchScope::default(),
                },
                WatchTarget {
                    username: "alice_dev".into(),
                    aliases: vec!["Alice".into()],
                    scope: WatchScope::default(),
                },
            ],
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert_eq!(
            err,
            WatchlistError::DuplicateUsername {
                username: "Alice".into(),
            }
        );
    }

    #[test]
    fn rejects_invalid_alias_username() {
        let cfg = WatchlistConfig {
            targets: vec![WatchTarget {
                username: "alice".into(),
                aliases: vec!["bad space".into()],
                scope: WatchScope::default(),
            }],
            ..WatchlistConfig::default()
        };

        let err = cfg.validate().unwrap_err();
        assert!(matches!(err, WatchlistError::InvalidUsername { .. }));
    }
}