awkrs 0.4.10 - Docs.rs

# Bigram Markov-chain analytics over a text corpus.
# 1. Tokenize each input line into lowercase a-z words.
# 2. Build bg[prev, curr] count.
# 3. For each starting word, list the top-3 follow-on words by count
#    (lexicographic tie-break) — deterministic output, no rand() needed.
# 4. Also deterministically walk a 12-step chain starting from "the":
#    at each step, pick the next word with highest count; lex tie-break.

function bigrams_for_line(line,   n, i) {
  gsub(/[^a-zA-Z]+/, " ", line)
  line = tolower(line)
  n = split(line, w, " ")
  for (i = 1; i < n; i++) {
    if (w[i] == "" || w[i+1] == "") continue
    bg[w[i], w[i+1]]++
    starts[w[i]]++
  }
}

function top_next(prev,   k, a, b, top1, top2, top3, c1, c2, c3) {
  top1 = ""; top2 = ""; top3 = ""
  c1 = c2 = c3 = -1
  for (k in bg) {
    split(k, parts, SUBSEP)
    if (parts[1] != prev) continue
    a = parts[2]; b = bg[k]
    # insert into top-3 sorted by (count desc, name asc)
    if (b > c1 || (b == c1 && a < top1)) {
      c3 = c2; top3 = top2
      c2 = c1; top2 = top1
      c1 = b;  top1 = a
    } else if (b > c2 || (b == c2 && a < top2)) {
      c3 = c2; top3 = top2
      c2 = b;  top2 = a
    } else if (b > c3 || (b == c3 && a < top3)) {
      c3 = b;  top3 = a
    }
  }
  out = top1 "(" c1 ")"
  if (top2 != "") out = out " " top2 "(" c2 ")"
  if (top3 != "") out = out " " top3 "(" c3 ")"
  return out
}

function walk(seed, steps,   path, cur, i) {
  cur = seed; path = cur
  for (i = 1; i <= steps; i++) {
    # find best next via top_next-style scan (just top1)
    best = ""; bcnt = -1
    for (k in bg) {
      split(k, parts, SUBSEP)
      if (parts[1] != cur) continue
      cv = bg[k]
      if (cv > bcnt || (cv == bcnt && parts[2] < best)) { bcnt = cv; best = parts[2] }
    }
    if (best == "") break
    path = path " " best
    cur = best
  }
  return path
}

{ bigrams_for_line($0) }

END {
  print "[top continuations]"
  PROCINFO["sorted_in"] = "@ind_str_asc"
  for (s in starts) {
    if (starts[s] < 2) continue
    printf "  %s -> %s\n", s, top_next(s)
  }
  print "[walk from \"the\"]"
  print "  " walk("the", 12)
}