// patterns.rhai — curated regex helpers for Kelora pipelines.
//
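// Provides functions for detecting and extracting common patterns from log text:
// - has_pattern(text, name) → bool
// - extract_pattern(text, name) → string (first match, "" if none)
// - extract_patterns(text, name) → array (all matches)
// - emit_patterns(text, name, key) → emits one event per match, storing the match under `key`
// - emit_patterns_with_base(text, name, key, base) → like emit_patterns, but each event starts as a copy of `base`
// - pattern_names() → sorted array of available pattern names
// - pattern_defs(name) → array of definition maps (`re`, `desc`) for a pattern, [] if unknown
// - pattern_regexes(name) → array of regex strings for a pattern
// - pattern_first(name) → first regex string for a pattern, "" if unknown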
//
// Usage examples:
// # Detect URLs in messages
// kelora --include examples/patterns.rhai -f json logs.jsonl \
// --exec 'if has_pattern(e.message, "url") { e } else { () }'
//
// # Extract all email addresses
// kelora --include examples/patterns.rhai -f json logs.jsonl \
// --exec 'let emails = extract_patterns(e.message, "email"); print(emails)'
//
// # Emit separate events for each IPv4 address found
// kelora --include examples/patterns.rhai -f json logs.jsonl \
// --exec 'emit_patterns(e.message, "ipv4", "ip_address")'
//
// # List all available patterns
// kelora --include examples/patterns.rhai -f json logs.jsonl \
// --exec 'print(pattern_names())'
// Build the table of named pattern definitions.
//
// Returns an object map keyed by pattern name. Each value is an array of
// definition maps with two keys:
//   re   - the regular expression source (raw string literal)
//   desc - a short human-readable description / example
// A definition may also carry an integer `group` key, which _def_group reads;
// no entry in this table currently sets one.
//
// The map literal is evaluated on every call, so each call returns a new table.
//
// NOTE(review): in the ipv6 entry the whole alternation is wrapped in \b...\b,
// so alternatives that start or end with ':' can only match when a word
// character sits directly next to that colon (e.g. a bare "::1" preceded by a
// space would not satisfy the leading \b) — confirm this is intended.
fn _get_pattern_defs() {
#{
duration: [
#{ re: #"\b\d+(?:\.\d+)?(?:ns|us|ms|s|m|h)\b"#, desc: "Short units like 12ms or 500us" },
#{ re: #"(?i)\b\d+(?:\.\d+)?\s*(?:microsecond|millisecond|second|minute|hour|day|week|month|year)s?\b"#, desc: "Verbose durations such as 3 minutes" },
#{ re: #"\b\d+h\d+m(?:\d+s)?\b"#, desc: "Compact combos like 1h30m15s" },
#{ re: #"\b\d+m\d+s\b"#, desc: "Minute+second combos like 7m30s" },
],
email: [
#{ re: #"(?i)\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b"#, desc: "Email addresses" },
],
error_token: [
#{ re: #"(?i)\b(?:error|err|panic|critical|crit|alert|fatal|emerg|failed|failure|exception|abort|severe)\b"#, desc: "Error-ish keywords" },
],
fail_signal: [
#{ re: #"(?i)\b(?:err(?:or)?|fail(?:ure|ed|ing)?|den(?:y|ied)|invalid|time(?:out|d\s*out|-?\s*out)|timout|exception|blocked|expir(?:ed|ing|ation|e)?|reject(?:ed|ing|ion)?|unauthoriz(?:ed|ation|e)?|unauth|forbidden|corrupt(?:ed|ion)?|malform(?:ed|ation)?|disconnect(?:ed|ion)?|unreachable|violat(?:ed|ion|e)?|blacklist(?:ed|ing)?|crash(?:ed|ing)?|abort(?:ed|ing)?|panic|crit(?:ical)?|alert|fatal|emerg(?:ency)?)\b"#, desc: "Failure / denial keywords" },
],
fqdn: [
#{ re: #"(?i)\b(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+(?:[a-z]{2,63})\b"#, desc: "host.example.com" },
],
function_call: [
#{ re: #"\b[\w.]+\([^()\n]*\)"#, desc: "function(args) style call" },
],
git_commit: [
#{ re: #"(?i)\b[0-9a-f]{7,40}\b"#, desc: "Git commit hashes" },
],
hex_color: [
#{ re: #"#(?:[0-9a-fA-F]{3}){1,2}\b"#, desc: "#1A2B3C or #abc" },
],
hex_number: [
#{ re: #"\b0x[0-9a-fA-F]+\b"#, desc: "0xdeadbeef" },
],
ipv4: [
#{ re: #"\b(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}\b"#, desc: "IPv4 address" },
],
ipv4_port: [
#{ re: #"\b(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}:(?:6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[1-9]\d{0,3}|0)\b"#, desc: "IPv4 with TCP/UDP port" },
],
ipv6: [
#{ re: ##"(?xi)\b(?:
(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4}|
(?:[0-9a-f]{1,4}:){1,7}:|
:(?:[0-9a-f]{1,4}:){1,7}|
(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}|
(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}|
(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}|
(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}|
(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}|
[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}|
::(?:ffff(?::0{1,4}){0,1}:)?(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}|
(?:[0-9a-f]{1,4}:){1,4}:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}|
::
)\b"##, desc: "IPv6 address (standard and compressed)" },
],
iso_timestamp: [
#{ re: #"\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?\b"#, desc: "ISO-8601 timestamp" },
],
jwt: [
#{ re: #"\beyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*\b"#, desc: "JWT token" },
],
mac: [
#{ re: #"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b"#, desc: "MAC aa:bb:cc:dd:ee:ff" },
#{ re: #"\b(?:[0-9A-Fa-f]{4}\.){2}[0-9A-Fa-f]{4}\b"#, desc: "Cisco style MAC" },
],
md5: [
#{ re: #"(?i)\b[a-f0-9]{32}\b"#, desc: "MD5 hash" },
],
sha1: [
#{ re: #"(?i)\b[a-f0-9]{40}\b"#, desc: "SHA-1 hash" },
],
sha256: [
#{ re: #"(?i)\b[a-f0-9]{64}\b"#, desc: "SHA-256 hash" },
],
number: [
#{ re: #"[+-]?(?:\d+\.\d+|\d+\.\d*|\.\d+|\d+)(?:[eE][+-]?\d+)?"#, desc: "Integer/float literal" },
],
oauth: [
#{ re: #"\bya29\.[0-9A-Za-z_-]+\b"#, desc: "Google OAuth token" },
],
path_unix: [
#{ re: #"(?:(?:/|~)[^\s\"'<>]+)"#, desc: "/var/log/app.log" },
],
path_windows: [
#{ re: #"(?i)\b(?:[A-Z]:\\|\\\\)[^\s\"'<>]+\b"#, desc: "C:\\Windows\\System32" },
],
win_registry: [
#{ re: #"\bHKEY_[A-Z_]+(?:\\[A-Za-z0-9_]+)+\b"#, desc: "Windows registry path" },
],
sql_statement: [
#{ re: ##""(?:(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP|TRUNCATE|GRANT|REVOKE|MERGE)\s+(?:(?:""|[^"])+))"##, desc: "Quoted SQL statement" },
],
url: [
#{ re: #"(?i)\b(?:[a-z][a-z0-9+.-]*://[^\s\"'<>]+)"#, desc: "http://example.com/path" },
],
uuid: [
#{ re: #"(?i)\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b"#, desc: "UUID/GUID" },
],
version: [
#{ re: #"\b[vV]?\d+\.\d+(?:\.\d+)?(?:-[A-Za-z0-9]+)?\b"#, desc: "Semantic version" },
],
}
}
// Report whether `name` is one of the keys of the pattern table.
// Returns true when a key equal to `name` is present, false otherwise.
fn _pattern_exists(name) {
    let known_names = _get_pattern_defs().keys();
    known_names.contains(name)
}
// Fetch the array of definition maps registered under `name`.
//
// name: pattern name to look up.
// Returns the definitions array for that pattern, or an empty array when the
// name is not a key of the pattern table.
fn _pattern_defs(name) {
    // Build the table once and reuse it for both the membership test and the
    // lookup; _get_pattern_defs() evaluates the whole map literal on each call.
    let table = _get_pattern_defs();
    for key in table.keys() {
        if key == name {
            return table[key];
        }
    }
    // Unknown name: callers rely on an empty array rather than unit.
    []
}
// Read the capture-group index from a definition map.
// Returns the value stored under "group" when that key exists, otherwise 0.
fn _def_group(def) {
    if def.keys().contains("group") {
        def["group"]
    } else {
        0
    }
}
// List every pattern name in the table.
// Returns an array of the table's keys after sorting it in place with sort().
fn pattern_names() {
    let sorted_keys = _get_pattern_defs().keys();
    sorted_keys.sort();
    sorted_keys
}
// Public accessor for the definitions of a pattern.
// Returns whatever _pattern_defs(name) yields: the definitions array, or []
// when the name is unknown.
fn pattern_defs(name) {
    let entries = _pattern_defs(name);
    entries
}
// Collect the regex source of every definition registered under `name`.
// Returns an array of the `re` values in definition order; empty when the
// name is unknown.
fn pattern_regexes(name) {
    _pattern_defs(name).map(|entry| entry.re)
}
// Return the regex source of the first definition registered under `name`.
// Yields "" when the pattern has no definitions (including unknown names).
fn pattern_first(name) {
    let entries = _pattern_defs(name);
    if entries.len() > 0 {
        entries[0].re
    } else {
        ""
    }
}
// Test whether `text` matches any definition of the pattern `name`.
//
// text: string to search.
// name: pattern name from the table.
// Returns true when at least one definition produces a non-empty extraction,
// false otherwise (including unknown pattern names).
fn has_pattern(text, name) {
    // extract_pattern walks the definitions in the same order and returns the
    // first non-empty extraction, or "" when none is found.
    let first_hit = extract_pattern(text, name);
    first_hit != ""
}
// Extract the first match of the pattern `name` from `text`.
//
// text: string to search.
// name: pattern name from the table.
// Definitions are tried in table order; the first non-empty result of
// extract_regex wins. Returns "" when nothing matches or the name is unknown.
fn extract_pattern(text, name) {
    for entry in _pattern_defs(name) {
        let group_idx = _def_group(entry);
        // A positive group index selects a capture group; otherwise the
        // two-argument form of extract_regex is used.
        if group_idx > 0 {
            let hit = text.extract_regex(entry.re, group_idx);
            if hit != "" {
                return hit;
            }
        } else {
            let hit = text.extract_regex(entry.re);
            if hit != "" {
                return hit;
            }
        }
    }
    ""
}
// Extract every match of the pattern `name` from `text`.
//
// text: string to search.
// name: pattern name from the table.
// Returns one array holding the results of extract_regexes for each
// definition, concatenated in definition order; empty when nothing matches
// or the name is unknown.
fn extract_patterns(text, name) {
    let collected = [];
    for entry in _pattern_defs(name) {
        let group_idx = _def_group(entry);
        // A positive group index selects a capture group; otherwise the
        // two-argument form of extract_regexes is used.
        if group_idx > 0 {
            collected.append(text.extract_regexes(entry.re, group_idx));
        } else {
            collected.append(text.extract_regexes(entry.re));
        }
    }
    collected
}
// Emit one event per match of the pattern `name` in `text`, storing each
// match under `key`. Delegates to emit_patterns_with_base with an empty base
// map and returns its result.
fn emit_patterns(text, name, key) {
    let empty_base = #{};
    emit_patterns_with_base(text, name, key, empty_base)
}
// Emit one event per match of the pattern `name` in `text`.
//
// text: string to search.
// name: pattern name from the table.
// key:  field under which each match is stored in its event.
// base: map cloned for every event before `key` is set on the copy.
// Returns 0 when there are no matches; otherwise returns the result of
// emit_each (defined outside this file) called with the built events.
fn emit_patterns_with_base(text, name, key, base) {
    let found = extract_patterns(text, name);
    if found.len() > 0 {
        let events = [];
        for item in found {
            // Each event starts from its own copy of the base map.
            let event = base.clone();
            event[key] = item;
            events.push(event);
        }
        return emit_each(events);
    }
    0
}