awkrs 0.4.13

Awk implementation in Rust with broad CLI compatibility, parallel records, and experimental Cranelift JIT
Documentation
# Aho-Corasick multi-pattern string matching: build a goto/fail/output trie
# from a list of patterns, then scan a text linearly, emitting every match.
#
# Input lines:
#   "PAT <p>"           add pattern p to the dictionary
#   "BUILD"             freeze the dictionary and compute fail links
#   "TXT <s>"           scan s; print "TXT s -> pat@pos pat@pos ..." (1-based
#                       end-position of each match) or "TXT s -> NONE"
# Multiple PAT lines may precede BUILD; multiple TXT lines may follow.
# Trie:
#   goto[node, char] = child node id (0 means follow fail link)
#   fail[node]       = suffix-failure link
#   out[node]        = space-separated patterns ending at this node

function tr_add(p,   i, n, c, cur, nxt) {
  n = length(p)
  cur = 0
  for (i = 1; i <= n; i++) {
    c = substr(p, i, 1)
    if (!((cur SUBSEP c) in goto_)) {
      goto_[cur, c] = ++tn
      fail[tn] = 0
    }
    cur = goto_[cur, c]
  }
  out[cur] = (out[cur] == "" ? p : out[cur] " " p)
}

function tr_build(   q_head, q_tail, c, u, v, r, k) {
  delete q
  q_head = 1; q_tail = 0
  PROCINFO["sorted_in"] = "@ind_str_asc"
  for (k in goto_) {
    split(k, parts, SUBSEP)
    if (parts[1] != 0) continue
    c = parts[2]
    v = goto_[k]
    fail[v] = 0
    q_tail++; q[q_tail] = v
  }
  while (q_head <= q_tail) {
    u = q[q_head]; q_head++
    # For each child (u --c--> v): bfs and link.
    delete kids
    for (k in goto_) {
      split(k, parts, SUBSEP)
      if (parts[1] != u) continue
      kids[parts[2]] = goto_[k]
    }
    for (c in kids) {
      v = kids[c]
      r = fail[u]
      while (r != 0 && !((r SUBSEP c) in goto_)) r = fail[r]
      fail[v] = ((r SUBSEP c) in goto_ && goto_[r, c] != v) ? goto_[r, c] : 0
      if (out[fail[v]] != "") out[v] = (out[v] == "" ? out[fail[v]] : out[v] " " out[fail[v]])
      q_tail++; q[q_tail] = v
    }
  }
}

function ac_scan(s,   i, n, c, cur, hits, name, sep, nx) {
  hits = ""; sep = ""
  cur = 0
  n = length(s)
  for (i = 1; i <= n; i++) {
    c = substr(s, i, 1)
    while (cur != 0 && !((cur SUBSEP c) in goto_)) cur = fail[cur]
    if ((cur SUBSEP c) in goto_) cur = goto_[cur, c]
    if (out[cur] != "") {
      nm = out[cur]
      split(nm, names, " ")
      for (k = 1; k in names; k++) {
        hits = hits sep names[k] "@" i
        sep = " "
      }
    }
  }
  return hits == "" ? "NONE" : hits
}

$1 == "PAT"   { tr_add(substr($0, 5)); next }
$1 == "BUILD" { tr_build(); next }
$1 == "TXT"   { txt = substr($0, 5); printf "TXT %s -> %s\n", txt, ac_scan(txt); next }