iocaine 3.0.0

The deadliest poison known to AI
Documentation
-- SPDX-FileCopyrightText: 2025 Gergely Nagy
-- SPDX-FileContributor: Gergely Nagy
--
-- SPDX-License-Identifier: MIT

--- Fill in every configuration option that is still unset.
--
-- Only slots whose current value is `nil` are written, so an explicit `false`
-- (for example `minify = false`) is left alone. The `garbage` table and its
-- `title`, `paragraphs` and `links` sub-tables are created when missing, and
-- then each of their keys is defaulted individually.
function apply_default_config()
   -- Store `fallback` in `tbl[key]` only when nothing is there yet, and hand
   -- back whatever the slot holds afterwards.
   local function ensure(tbl, key, fallback)
      if tbl[key] == nil then
         tbl[key] = fallback
      end
      return tbl[key]
   end

   local config = iocaine.config
   ensure(config, "minify", true)

   local garbage = ensure(config, "garbage", {})

   -- Each entry: section name, followed by its { key, default } pairs.
   local sections = {
      { "title", {
         { "min-words", 2 },
         { "max-words", 15 },
      } },
      { "paragraphs", {
         { "min-count", 1 },
         { "max-count", 5 },
         { "min-words", 10 },
         { "max-words", 69 },
      } },
      { "links", {
         { "min-count", 1 },
         { "max-count", 8 },
         { "min-uri-parts", 1 },
         { "max-uri-parts", 2 },
         { "min-text-words", 2 },
         { "max-text-words", 5 },
         { "uri-separator", "-" },
      } },
   }

   for _, section in ipairs(sections) do
      local target = ensure(garbage, section[1], {})
      for _, entry in ipairs(section[2]) do
         ensure(target, entry[1], entry[2])
      end
   end
end

--- Register the three QMK counters and publish them as globals.
--
-- Each counter is created on `iocaine.metrics.registry` and passed to
-- `iocaine.metrics.loaded:update` immediately afterwards. Once all three
-- exist they are stored in `METRIC_REQUESTS`, `METRIC_RULESET_HITS` and
-- `METRIC_GARBAGE_GENERATED`.
function init_metrics()
   iocaine.log.debug("Registering metrics")

   local metrics = iocaine.metrics

   -- Create one counter and hand it to the `loaded` collection. The trailing
   -- arguments are forwarded untouched (presumably label names -- confirm
   -- against the metrics API).
   local function register(name, help, ...)
      local counter = metrics.registry:new_counter(name, help, ...)
      metrics.loaded:update(counter)
      return counter
   end

   local requests = register(
      "qmk_requests", "Number of requests received", "host"
   )
   local ruleset_hits = register(
      "qmk_ruleset_hits", "Number of times a ruleset has been hit",
      "ruleset", "outcome"
   )
   local garbage_generated = register(
      "qmk_garbage_generated", "Amount of garbage generated, in bytes",
      "host"
   )

   -- Globals are only assigned after every registration went through.
   _G.METRIC_REQUESTS = requests
   _G.METRIC_RULESET_HITS = ruleset_hits
   _G.METRIC_GARBAGE_GENERATED = garbage_generated
end

--- Build the `AI_ROBOTS_TXT` matcher from a robots JSON document.
--
-- The document is read from the file named by the `ai-robots-txt-path`
-- option or, when that option is unset, from the embedded
-- `/defaults/etc/robots.json`. Only the top-level keys of the decoded data
-- are used: each key becomes one argument to `iocaine.matcher.Patterns`.
function init_check_ai_robots_txt()
   local configured_path = iocaine.config["ai-robots-txt-path"]
   local robots

   if configured_path then
      iocaine.log.debug(string.format("Loading ai-robots-txt from %s", configured_path))
      robots = iocaine.file.read_as_json(configured_path)
   else
      iocaine.log.warn("No ai-robots-txt-path configured, using default")
      robots = iocaine.serde.parse_json(iocaine.file.read_embedded("/defaults/etc/robots.json"))
   end

   -- `pairs` visits keys in unspecified order, so the pattern order is
   -- unspecified as well.
   local names = {}
   for name in pairs(robots) do
      names[#names + 1] = name
   end

   _G.AI_ROBOTS_TXT = iocaine.matcher.Patterns(table.unpack(names))
end

--- Publish the `MAJOR_BROWSERS` matcher.
--
-- The matcher is built by `iocaine.matcher.Patterns` from the two literal
-- patterns "Chrome/" and "Firefox".
function init_check_major_browsers()
   local build_patterns = iocaine.matcher.Patterns
   _G.MAJOR_BROWSERS = build_patterns("Chrome/", "Firefox")
end

--- Build the `UNWANTED_VISITORS` matcher from the `unwanted-visitors` option.
--
-- When the option is unset (`nil`) the single pattern "Perplexity" is used.
-- A list is expanded into one pattern per element. A bare string is accepted
-- as shorthand for a one-element list, matching the scalar-or-list handling
-- in `init_sources`; without this, `table.unpack` raises on a string value.
function init_check_unwanted_visitors()
   local unwanted = iocaine.config["unwanted-visitors"]
   if unwanted == nil then
      unwanted = { "Perplexity" }
   elseif type(unwanted) == "string" then
      -- Wrap the scalar so the unpack below sees a proper sequence.
      unwanted = { unwanted }
   end
   _G.UNWANTED_VISITORS = iocaine.matcher.Patterns(table.unpack(unwanted))
end

--- Create the `MARKOV` and `WORDLIST` generators.
--
-- Reads `iocaine.config.sources`. Its `training-corpus` entry is handed to
-- `iocaine.generator.Markov` and its `wordlists` entry to
-- `iocaine.generator.WordList`. A table entry is expanded into separate
-- arguments, any other truthy entry is passed as a single argument, and a
-- missing entry (or a missing `sources` section) results in a call with no
-- arguments.
function init_sources()
   -- Invoke `factory` with the arguments described by `spec`.
   local function construct(factory, spec)
      if not spec then
         return factory()
      elseif type(spec) == "table" then
         return factory(table.unpack(spec))
      end
      return factory(spec)
   end

   -- With no `sources` section both lookups below yield nil, which selects
   -- the zero-argument form for both generators.
   local sources = iocaine.config.sources or {}
   local generator = iocaine.generator

   _G.MARKOV = construct(generator.Markov, sources["training-corpus"])
   _G.WORDLIST = construct(generator.WordList, sources.wordlists)
end

--- Compile the HTML template and publish the engine and the result.
--
-- The template text comes from, in order of precedence: the inline
-- `template` option, the file named by the `template-file` option, or the
-- embedded `/defaults/templates/garbage.html`. The engine instance is stored
-- in `ENGINE` and the compiled template in `TEMPLATE_HTML`.
function init_template()
   local log = iocaine.log
   local config = iocaine.config

   -- Pick the template text, logging which origin was chosen.
   local function resolve_source()
      local inline = config.template
      if inline then
         log.debug("HTML template loaded from configuration")
         return inline
      end

      local path = config["template-file"]
      if path then
         log.debug(string.format("Loading HTML template from %s", path))
         return iocaine.file.read_as_string(path)
      end

      log.debug("Loading embedded HTML template")
      return iocaine.file.read_embedded("/defaults/templates/garbage.html")
   end

   local source = resolve_source()

   log.debug("Initializing template engine")
   local engine = iocaine.TemplateEngine()
   _G.ENGINE = engine
   _G.TEMPLATE_HTML = engine:compile(source)
end

--- Record whether the `logging` option is switched on.
--
-- `LOGGING_ENABLED` becomes `true` when the option holds any truthy value and
-- `false` when it is `nil` or `false`.
function init_logging()
   -- Double negation coerces the raw option value to a real boolean.
   _G.LOGGING_ENABLED = not not iocaine.config["logging"]
end

--- Run every initialisation phase, in a fixed order.
--
-- Defaults are applied first, then metrics, the three matchers and the
-- generators are set up, and finally the template and the logging flag.
function init()
   -- Phases are named rather than referenced so that each global function is
   -- looked up only at the moment it is about to run.
   local phases = {
      "apply_default_config",
      "init_metrics",
      "init_check_ai_robots_txt",
      "init_check_major_browsers",
      "init_check_unwanted_visitors",
      "init_sources",
      "init_template",
      "init_logging",
   }

   for _, phase in ipairs(phases) do
      _G[phase]()
   end
end

-- The chunk evaluates to the `init` function itself; it is not called here.
-- NOTE(review): presumably the host invokes it once after loading -- confirm.
return init