// std/web — deterministic HTTP-backed source ingest and extraction helpers.
import { cache_get, cache_put } from "std/cache"
import { safe_parse } from "std/json"
// Case-insensitive header lookup. Returns the value of the first entry in
// `headers` whose key equals `name` ignoring case, or nil when `headers` is
// nil or holds no such key.
fn __web_header(headers, name) {
  let target = lowercase(name)
  let table = headers ?? {}
  for pair in table {
    if lowercase(pair.key) != target {
      continue
    }
    return pair.value
  }
  return nil
}
// Narrow the caller options to the `{store: ...}` dict handed to the cache
// helpers. Yields nil when no store was configured.
fn __web_store_options(opts) {
  let store = opts?.store
  if store != nil {
    return {store: store}
  }
  return nil
}
// Cache key for a fetch: the caller's `cache_key` override when present,
// otherwise "std/web:web_fetch:<METHOD>:<url>" with the method upper-cased.
fn __web_cache_key(method, url, opts) {
  let explicit_key = opts?.cache_key
  if explicit_key != nil {
    return explicit_key
  }
  return "std/web:web_fetch:" + uppercase(method) + ":" + url
}
// Compute the URL a request effectively targets once `http_opts.query` is
// merged into the query string already present on `url`. Entries with a nil
// value are skipped; other values are stringified and appended after the
// existing pairs. With no `query` option the URL is returned untouched.
fn __web_effective_url(url, http_opts) {
  let extra = http_opts?.query
  if extra == nil {
    return url
  }
  let parts = url_parse(url)
  var merged = query_parse(parts?.query ?? "")
  for param in extra {
    if param.value == nil {
      continue
    }
    merged = merged + [{key: param.key, value: to_string(param.value)}]
  }
  let rebuilt = {
    scheme: parts.scheme,
    host: parts.host,
    port: parts.port,
    path: parts.path ?? "/",
    query: query_stringify(merged),
    fragment: parts.fragment,
    username: parts.username,
    password: parts.password,
  }
  return url_build(rebuilt)
}
// Request headers before conditional headers are added: the headers under
// `http_options` merged with top-level `headers`, the latter on the
// right-hand side of the merge.
fn __web_base_headers(opts) {
  let base = opts?.http_options?.headers ?? {}
  let overrides = opts?.headers
  if overrides != nil {
    return base + overrides
  }
  return base
}
// Add If-None-Match / If-Modified-Since derived from a previous envelope.
// Nothing is added when `enabled` is false or there is no previous envelope,
// and a validator header the caller already supplied (matched
// case-insensitively) is never overwritten.
fn __web_conditional_headers(headers, previous, enabled) {
  if !enabled {
    return headers
  }
  if previous == nil {
    return headers
  }
  var out = headers
  let etag = previous?.etag
  if etag != nil && __web_header(out, "if-none-match") == nil {
    out = out + {"If-None-Match": etag}
  }
  let last_modified = previous?.last_modified
  if last_modified != nil && __web_header(out, "if-modified-since") == nil {
    out = out + {"If-Modified-Since": last_modified}
  }
  return out
}
// Build the normalized fetch envelope for a non-304 response. Missing
// response fields fall back to defaults (ok=false, body="", headers={}), and
// `final_url` falls back to `source_url`.
fn __web_fetch_envelope(source_url, response, fetched_at, cache_status) {
  let response_headers = response?.headers ?? {}
  let content_type = __web_header(response_headers, "content-type")
  let etag = __web_header(response_headers, "etag")
  let last_modified = __web_header(response_headers, "last-modified")
  let final_url = response?.final_url ?? source_url
  return {
    ok: response?.ok ?? false,
    status: response?.status,
    body: response?.body ?? "",
    headers: response_headers,
    content_type: content_type,
    etag: etag,
    last_modified: last_modified,
    fetched_at: fetched_at,
    cache_status: cache_status,
    source_url: source_url,
    final_url: final_url,
    not_modified: false,
  }
}
// Build the envelope for a 304 response. Starts from `previous` (so the
// earlier body is kept), overlays the 304 status fields, then refreshes
// etag / last_modified / content_type only when the 304 response carries the
// corresponding header.
fn __web_not_modified(source_url, response, previous, fetched_at) {
  let fresh_headers = response?.headers ?? {}
  var envelope = previous + {
    ok: true,
    status: 304,
    headers: fresh_headers,
    fetched_at: fetched_at,
    cache_status: "not_modified",
    source_url: source_url,
    final_url: response?.final_url ?? previous?.final_url ?? source_url,
    not_modified: true,
  }
  let refreshable = [
    {field: "etag", header: "etag"},
    {field: "last_modified", header: "last-modified"},
    {field: "content_type", header: "content-type"},
  ]
  for item in refreshable {
    let value = __web_header(fresh_headers, item.header)
    if value != nil {
      envelope = envelope + {[item.field]: value}
    }
  }
  return envelope
}
/**
* Fetch a web source through Harn's HTTP stack and normalize provenance.
*
* Options:
* - method, headers, query, body, timeout, retry, session, proxy, tls:
* forwarded to `http_request`.
* - http_options: explicit HTTP option dict when callers want web-specific
* options kept separate.
* - previous: previous `web_fetch` envelope used for conditional headers.
* - store: `std/cache` store. When present, cached ETag / Last-Modified
* values are used for conditional re-fetches and 304 responses reuse the
* cached body.
* - cache_key: override for the cache key.
* - conditional: set false to skip If-None-Match / If-Modified-Since.
* - fetched_at: deterministic timestamp override for fixtures.
*
* Returns an envelope dict with `ok`, `status`, `body`, `headers`,
* `content_type`, `etag`, `last_modified`, `fetched_at`, `cache_status`,
* `source_url`, `final_url`, and `not_modified`. `cache_status` is one of
* "bypass" (no store), "miss", "refresh" (a cached envelope was found), or
* "not_modified" (304 answered against a previous envelope).
*
* @effects: [net, time]
* @allocation: heap
* @errors: []
* @api_stability: stable
* @example: web_fetch(url, options)
*/
pub fn web_fetch(url: string, options = nil) -> dict {
let opts = options ?? {}
let method = uppercase(opts?.method ?? opts?.http_options?.method ?? "GET")
// Without an explicit `http_options`, the whole options dict doubles as the
// HTTP option dict.
var http_opts = opts?.http_options ?? opts
// The effective URL (query merged in) is used for provenance and the default
// cache key; the request below is still issued against `url`.
let source_url = __web_effective_url(url, http_opts)
let store_opts = __web_store_options(opts)
let cache_key = __web_cache_key(method, source_url, opts)
var previous = opts?.previous
var cache_status = if store_opts == nil {
"bypass"
} else {
"miss"
}
// A caller-supplied `previous` takes priority; the store is consulted only
// when none was given.
if previous == nil && store_opts != nil {
let cached = cache_get(cache_key, store_opts)
if cached.hit {
previous = cached.value
cache_status = "refresh"
}
}
let headers = __web_conditional_headers(__web_base_headers(opts), previous, opts?.conditional ?? true)
http_opts = http_opts + {headers: headers}
let response = harness.net.request(method, url, http_opts)
// The timestamp is taken after the request returns unless overridden.
let fetched_at = opts?.fetched_at ?? harness.clock.timestamp()
// A 304 is only meaningful with a previous envelope to reuse; otherwise it
// falls through to the generic envelope below.
if response.status == 304 && previous != nil {
let not_modified = __web_not_modified(source_url, response, previous, fetched_at)
if store_opts != nil {
cache_put(cache_key, not_modified, store_opts)
}
return not_modified
}
let envelope = __web_fetch_envelope(source_url, response, fetched_at, cache_status)
// Only `ok` responses are written to the store.
if store_opts != nil && envelope.ok {
cache_put(cache_key, envelope, store_opts)
}
return envelope
}
// Split a query into unique lower-case tokens. Runs of characters outside
// [A-Za-z0-9_./@:-] become separators, fragments of length <= 1 are dropped,
// and duplicates keep their first position.
fn __web_search_tokens(query) {
  let lowered = lowercase(query ?? "")
  let spaced = trim(regex_replace("[^A-Za-z0-9_./@:-]+", " ", lowered))
  var tokens = []
  for word in split(spaced, " ") {
    if len(word) <= 1 || tokens.contains(word) {
      continue
    }
    tokens = tokens.push(word)
  }
  return tokens
}
// Lower-cased haystack for scoring: title, name, URL, snippet, tags and
// package joined by single spaces (absent fields contribute an empty string).
fn __web_search_text(entry) {
  let fields = [
    to_string(entry?.title ?? ""),
    to_string(entry?.name ?? ""),
    to_string(entry?.url ?? entry?.href ?? entry?.source_url ?? ""),
    to_string(entry?.snippet ?? entry?.description ?? entry?.text ?? ""),
    join(entry?.tags ?? [], " "),
    to_string(entry?.package ?? ""),
  ]
  return lowercase(join(fields, " "))
}
// Relevance score for a curated entry.
//   +25  the whole trimmed query appears in the haystack
//   per token: +8 in title/name, else +5 in URL, else +2 anywhere
//   +6   source_type/kind is a docs or registry kind
//   +8   entry is flagged authority/authoritative
//   +2   entry carries a version or pinned_version
fn __web_search_score(entry, query, tokens) {
  let haystack = __web_search_text(entry)
  let title = lowercase(to_string(entry?.title ?? entry?.name ?? ""))
  let url = lowercase(to_string(entry?.url ?? entry?.href ?? entry?.source_url ?? ""))
  let phrase = lowercase(trim(query ?? ""))
  var total = 0
  if phrase != "" && contains(haystack, phrase) {
    total = total + 25
  }
  for token in tokens {
    if contains(title, token) {
      total = total + 8
    } else if contains(url, token) {
      total = total + 5
    } else if contains(haystack, token) {
      total = total + 2
    }
  }
  let trusted_kinds = ["docs", "pinned_docs", "package_registry", "registry"]
  let kind = entry?.source_type ?? entry?.kind
  if trusted_kinds.contains(kind) {
    total = total + 6
  }
  if entry?.authority || entry?.authoritative {
    total = total + 8
  }
  let has_version = entry?.version != nil || entry?.pinned_version != nil
  if has_version {
    total = total + 2
  }
  return total
}
// Return at most the first `limit` elements of `items`, in order.
fn __web_search_take(items, limit) {
  var taken = []
  for candidate in items {
    if len(taken) < limit {
      taken = taken.push(candidate)
    } else {
      return taken
    }
  }
  return taken
}
// Walk `value` along `path` and return what is found, or `fallback`.
// `path` is either a list of keys or a dotted string (empty segments are
// ignored). A nil path, or a nil encountered before the path is exhausted,
// yields `fallback`; so does a nil final value.
fn __web_search_path_get(value, path, fallback = nil) {
  if path == nil {
    return fallback
  }
  var segments = path
  if type_of(path) != "list" {
    segments = split(to_string(path), ".").filter({ segment -> segment != "" })
  }
  var cursor = value
  for segment in segments {
    if cursor == nil {
      return fallback
    }
    cursor = cursor[segment]
  }
  return cursor ?? fallback
}
// Merge two header dicts, treating nil as empty; `right` is the right-hand
// side of the merge.
fn __web_search_merge_headers(left, right) {
  return (left ?? {}) + (right ?? {})
}
// Options for a tool invocation: the registry-level `base` options merged
// with `args.options` when the call provides it, otherwise with the call
// arguments themselves.
fn __web_search_tool_options(base, args) {
  let per_call = args?.options ?? args ?? {}
  return (base ?? {}) + per_call
}
// Resolve which search backend a `web_search` call should use. The first
// match wins, in this order:
//   1. `opts.backend` as a dict (kind defaults to "curated_index")
//   2. `opts.backend` as a string (used as both kind and id)
//   3. `opts.provider_results`  -> "provider_hosted"
//   4. `opts.api`               -> "api"
//   5. HARN_WEB_SEARCH_URL env  -> "api" configured from the environment
//   6. otherwise                -> "curated_index"
fn __web_search_backend(opts) {
let explicit = opts?.backend
if type_of(explicit) == "dict" {
// Caller-supplied fields are kept; kind, id and deterministic are
// overlaid on top.
return explicit
+ {
kind: explicit?.kind ?? explicit?.type ?? "curated_index",
id: explicit?.id ?? explicit?.name ?? explicit?.kind ?? "configured",
deterministic: true,
}
}
if type_of(explicit) == "string" {
return {kind: explicit, id: explicit, deterministic: true}
}
if opts?.provider_results != nil {
return {kind: "provider_hosted", id: "provider_results", deterministic: true, evidence: "provider_results"}
}
if opts?.api != nil {
let api = opts.api
return {
kind: "api",
id: api?.id ?? api?.name ?? api?.url ?? "configured_search_api",
deterministic: true,
evidence: "search_api",
api: api,
}
}
// Environment fallback: an empty value is treated the same as unset.
let env_url = harness.env.get("HARN_WEB_SEARCH_URL")
if env_url != nil && env_url != "" {
var headers = {}
let bearer = harness.env.get("HARN_WEB_SEARCH_BEARER_TOKEN")
if bearer != nil && bearer != "" {
headers = {Authorization: "Bearer " + bearer}
}
return {
kind: "api",
id: "env:HARN_WEB_SEARCH_URL",
deterministic: true,
evidence: "search_api",
api: {
url: env_url,
query_param: harness.env.get_or("HARN_WEB_SEARCH_QUERY_PARAM", "q"),
headers: headers,
},
}
}
return {
kind: "curated_index",
id: opts?.index_id ?? "inline",
deterministic: true,
evidence: "curated_index",
}
}
// Reduce a resolved backend to the fields exposed in search results: kind,
// id, deterministic (default true), evidence (default: the kind), plus
// source_url / final_url when the backend carries them.
fn __web_search_public_backend(backend) {
  var summary = {
    kind: backend.kind,
    id: backend.id,
    deterministic: backend?.deterministic ?? true,
    evidence: backend?.evidence ?? backend.kind,
  }
  let source_url = backend?.source_url
  if source_url != nil {
    summary = summary + {source_url: source_url}
  }
  let final_url = backend?.final_url
  if final_url != nil {
    summary = summary + {final_url: final_url}
  }
  return summary
}
// Convert a raw entry into the normalized result shape. The URL is taken
// from url / href / source_url / docs_url (first non-nil), the title falls
// back to name and then the URL, and provenance records the backend plus
// the optional score.
fn __web_search_normalize(entry, rank, backend, score = nil) {
  let link = entry?.url ?? entry?.href ?? entry?.source_url ?? entry?.docs_url
  let origin = entry?.source_url ?? link
  let provenance = {
    backend_kind: backend.kind,
    backend_id: backend.id,
    evidence: backend?.evidence ?? backend.kind,
    score: score,
    source_url: origin,
    trust_score: entry?.trust_score,
    first_seen: entry?.first_seen,
    fetched_at: backend?.fetched_at,
  }
  return {
    rank: rank,
    title: to_string(entry?.title ?? entry?.name ?? link ?? ""),
    url: link,
    snippet: to_string(entry?.snippet ?? entry?.description ?? entry?.text ?? ""),
    source_url: origin,
    final_url: entry?.final_url ?? link,
    source_type: entry?.source_type ?? entry?.kind ?? "web",
    authority: entry?.authority ?? entry?.authoritative ?? false,
    package: entry?.package ?? entry?.name,
    registry: entry?.registry ?? entry?.ecosystem,
    version: entry?.version ?? entry?.pinned_version,
    provenance: provenance,
  }
}
// Search an inline curated index. Entries come from `opts.index`,
// `opts.results` or `backend.index`; each is scored locally, zero-score
// entries are dropped unless `include_zero_score` is set, and the
// highest-scoring `limit` (default 10) entries are normalized with 1-based
// ranks.
fn __web_search_curated(query, opts, backend) {
  let terms = __web_search_tokens(query)
  let entries = opts?.index ?? opts?.results ?? backend?.index ?? []
  var candidates = []
  for entry in entries {
    let points = __web_search_score(entry, query, terms)
    if points > 0 || opts?.include_zero_score {
      candidates = candidates.push({entry: entry, score: points})
    }
  }
  // Negated score sorts highest first.
  let ranked = candidates.sort_by({ item -> 0 - item.score })
  let top = __web_search_take(ranked, opts?.limit ?? 10)
  var results = []
  for hit in top {
    results = results.push(__web_search_normalize(hit.entry, len(results) + 1, backend, hit.score))
  }
  let provenance = {resolver: "std/web", result_count: len(results), authoritative_first: true}
  return {
    ok: true,
    query: query,
    backend: __web_search_public_backend(backend),
    results: results,
    provenance: provenance,
  }
}
// Normalize externally captured provider results. The first `limit`
// (default 10) entries are kept in their given order and ranked from 1; no
// local scoring is applied.
fn __web_search_provider_results(query, opts, backend) {
  let captured = __web_search_take(opts.provider_results ?? [], opts?.limit ?? 10)
  var results = []
  for entry in captured {
    results = results.push(__web_search_normalize(entry, len(results) + 1, backend))
  }
  let provenance = {resolver: "std/web", result_count: len(results), authoritative_first: false}
  return {
    ok: true,
    query: query,
    backend: __web_search_public_backend(backend),
    results: results,
    provenance: provenance,
  }
}
// Map one raw API result onto title / url / snippet / source_url using the
// API's configurable `*_path` settings, falling back to conventional field
// names. Non-dict items are stringified into both title and snippet.
fn __web_search_api_item(item, api) {
  if type_of(item) != "dict" {
    let text = to_string(item)
    return {title: text, snippet: text}
  }
  let title = __web_search_path_get(item, api?.title_path ?? "title", item?.title ?? item?.name)
  let url = __web_search_path_get(item, api?.url_path ?? "url", item?.url ?? item?.href)
  let snippet = __web_search_path_get(
    item,
    api?.snippet_path ?? "snippet",
    item?.snippet ?? item?.description ?? item?.text,
  )
  let source_url = __web_search_path_get(item, api?.source_url_path ?? "source_url", item?.source_url)
  return item + {title: title, url: url, snippet: snippet, source_url: source_url}
}
// Run a search against a configured JSON API via `web_fetch` and normalize
// the response. `ok` in the result is true only when the fetch succeeded and
// the body parsed.
fn __web_search_api(query, opts, backend) {
let api = backend.api ?? opts.api
let limit = opts?.limit ?? api?.limit ?? 10
var fetch_options = api?.fetch_options ?? {}
let method = uppercase(api?.method ?? fetch_options?.method ?? "GET")
if method == "GET" {
// GET: the query text travels as a query-string parameter (default "q"),
// added to any query entries already configured.
let query_param = api?.query_param ?? "q"
let existing_query = fetch_options?.query ?? {}
fetch_options = fetch_options + {method: method, query: existing_query + {[query_param]: query}}
} else {
// Other methods: send a body, defaulting to a JSON {query, limit} payload
// when the API config does not supply one.
let headers = __web_search_merge_headers(fetch_options?.headers, api?.headers)
fetch_options = fetch_options
+ {method: method, headers: headers, body: api?.body ?? json_stringify({query: query, limit: limit})}
}
// For GET, API-level headers are merged here, and only when configured.
if api?.headers != nil && method == "GET" {
fetch_options = fetch_options + {headers: __web_search_merge_headers(fetch_options?.headers, api.headers)}
}
let fetched = web_fetch(api.url, fetch_options)
let parsed = safe_parse(fetched.body)
// Results are read from `results_path` (default "results"); anything that is
// not a list is treated as no results.
let items = __web_search_path_get(parsed, api?.results_path ?? "results", [])
let raw_results = if type_of(items) == "list" {
items
} else {
[]
}
var results = []
var rank = 1
for item in __web_search_take(raw_results, limit) {
let normalized = __web_search_api_item(item, api)
results = results
.push(
__web_search_normalize(
normalized,
rank,
backend + {fetched_at: fetched.fetched_at},
normalized?.score,
),
)
rank = rank + 1
}
let fetch_ok = fetched.ok ?? false
let parsed_ok = parsed != nil
return {
ok: fetch_ok && parsed_ok,
query: query,
backend: __web_search_public_backend(
backend + {source_url: fetched.source_url, final_url: fetched.final_url},
),
results: results,
provenance: {
resolver: "std/web",
result_count: len(results),
authoritative_first: false,
fetched_at: fetched.fetched_at,
cache_status: fetched.cache_status,
source_url: fetched.source_url,
final_url: fetched.final_url,
},
}
}
/**
 * Search web, docs, or registry evidence through a normalized backend
 * contract. The backend is resolved from `options`, then the call is
 * dispatched on its kind.
 *
 * Backends:
 * - `index` / `results`: deterministic curated entries scored locally.
 * - `api`: configured JSON search API fetched through `web_fetch`.
 * - `provider_results`: externally captured hosted-tool results normalized
 *   without coupling this primitive to any provider.
 * - `HARN_WEB_SEARCH_URL`: optional process configuration for an API backend.
 *
 * Any backend kind other than "api" or "provider_hosted" is handled by the
 * curated-index search.
 *
 * @effects: [net, time]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: web_search(query, options)
 */
pub fn web_search(query: string, options = nil) -> dict {
  let opts = options ?? {}
  let backend = __web_search_backend(opts)
  let kind = backend.kind
  if kind == "api" {
    return __web_search_api(query, opts, backend)
  }
  if kind == "provider_hosted" {
    return __web_search_provider_results(query, opts, backend)
  }
  return __web_search_curated(query, opts, backend)
}
/**
 * Deterministically verify package imports and optional symbol evidence for
 * source files. Options include `project_root`, `installed_packages`,
 * `registry`, `now`, `low_trust_threshold`, and `min_package_age_days`.
 * A nil `options` is passed on as an empty dict.
 *
 * @effects: [fs]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: verify_imports(paths, options)
 */
pub fn verify_imports(paths, options = nil) -> dict {
  let opts = options ?? {}
  return __verify_imports(paths, opts)
}
/**
* Add model-callable `web_search` and `verify_imports` tools with
* capability-gated grounding guidance.
*
* `registry` defaults to a fresh `tool_registry()`. `options` may rename the
* tools via `search_tool_name` / `verify_tool_name`, and is also merged into
* the options passed to every tool invocation. Returns the registry with
* both tools defined.
*
* @effects: []
* @allocation: heap
* @errors: []
* @api_stability: experimental
* @example: web_grounding_tools(registry, options)
*/
pub fn web_grounding_tools(registry = nil, options = nil) {
let opts = options ?? {}
var tools = registry ?? tool_registry()
// Search tool: forwards the `query` argument to `web_search`.
tools = tool_define(
tools,
opts?.search_tool_name ?? "web_search",
"Search authoritative docs, package registries, or a configured search API and return normalized results with provenance.",
{
parameters: {
type: "object",
required: ["query"],
properties: {query: {type: "string"}, limit: {type: "integer"}, index: {type: "array"}, api: {type: "object"}},
},
annotations: {readOnlyHint: true},
handler: { args -> web_search(args.query, __web_search_tool_options(opts, args)) },
guidance: "For unfamiliar APIs, package names, or recent behavior, call web_search against authoritative docs or registries before coding. Prefer pinned docs and registry evidence over general web snippets, and carry source URLs into the answer or follow-up prompt.",
},
)
// Verify tool: accepts `paths`, or a single `path` wrapped into a list when
// `paths` is absent.
tools = tool_define(
tools,
opts?.verify_tool_name ?? "verify_imports",
"Verify imports in source files against manifests, installed-package hints, registry evidence, symbol metadata, and package trust signals.",
{
parameters: {
type: "object",
required: ["paths"],
properties: {
paths: {type: "array", items: {type: "string"}},
project_root: {type: "string"},
registry: {type: "array"},
installed_packages: {},
},
},
annotations: {readOnlyHint: true},
handler: { args -> verify_imports(args.paths ?? [args.path], __web_search_tool_options(opts, args)) },
guidance: "After generating or editing code with imports, call verify_imports on the touched source files. Treat package_not_found and symbol_not_found as blockers. Treat fresh_package or low_trust_package warnings as a forced lookup trigger before proceeding.",
},
)
return tools
}
/**
 * Resolve the URL reference `href` against `base_url`, typically the source
 * URL of a fetched document.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_resolve_url(base_url, href)
 */
pub fn web_resolve_url(base_url: string, href: string) {
  let resolved = __web_resolve_url(base_url, href)
  return resolved
}
/**
 * Return the origin of `url` combined with `path` (default "/"), dropping
 * the original query and fragment.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_origin_url(url, path)
 */
pub fn web_origin_url(url: string, path: string = "/") -> string {
  let origin = __web_origin_url(url, path)
  return origin
}
/**
 * Parse an HTML document into title, meta, canonical URL, links, tables,
 * JSON-LD, and plain text. `source_url` is optional and is forwarded to the
 * extractor.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_parse_html(html, source_url)
 */
pub fn web_parse_html(html: string, source_url = nil) -> dict {
  let extracted = __web_extract_html(html, source_url)
  return extracted
}
/**
 * Return the `title` field of `web_parse_html(html)`.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_title(html)
 */
pub fn html_title(html: string) {
  let parsed = web_parse_html(html)
  return parsed.title
}
/**
 * Return the `meta` field of `web_parse_html(html)`: normalized HTML meta
 * tags keyed by lower-case name/property.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_meta(html)
 */
pub fn html_meta(html: string) {
  let parsed = web_parse_html(html)
  return parsed.meta
}
/**
 * Return the `links` field of `web_parse_html(html, source_url)`: links
 * extracted from anchors, resolved using `source_url` when given.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_links(html, source_url)
 */
pub fn html_links(html: string, source_url = nil) {
  let parsed = web_parse_html(html, source_url)
  return parsed.links
}
/**
 * Return the `tables` field of `web_parse_html(html)`: table captions,
 * headers, and rows.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_tables(html)
 */
pub fn html_tables(html: string) {
  let parsed = web_parse_html(html)
  return parsed.tables
}
/**
 * Return the `json_ld` field of `web_parse_html(html)`: parsed JSON-LD
 * script blocks.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_json_ld(html)
 */
pub fn html_json_ld(html: string) {
  let parsed = web_parse_html(html)
  return parsed.json_ld
}
/**
 * Return the `text` field of `web_parse_html(html)`: normalized visible
 * text without script/style/noscript bodies.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_text(html)
 */
pub fn html_text(html: string) {
  let parsed = web_parse_html(html)
  return parsed.text
}
// Location of robots.txt for the origin of `url`.
fn __web_robots_url(url) {
  let robots_path = "/robots.txt"
  return web_origin_url(url, robots_path)
}
// Split a "Field: value" line at its first colon. The field is trimmed and
// lower-cased; the value is trimmed and keeps any later colons (so URLs
// survive). Lines without a colon yield nil.
fn __web_line_value(line) {
  let pieces = split(line, ":")
  if len(pieces) >= 2 {
    let name = lowercase(trim(pieces[0]))
    let rest = join(pieces[1:], ":")
    return {field: name, value: trim(rest)}
  }
  return nil
}
// Parse a robots.txt body into {field, value} rows. Everything after `#` on
// a line is discarded; blank lines and lines without a colon are skipped.
fn __web_robots_rows(body) {
  var parsed = []
  for raw in split(body ?? "", "\n") {
    let without_comment = trim(split(raw, "#")[0])
    if without_comment == "" {
      continue
    }
    let entry = __web_line_value(without_comment)
    if entry == nil {
      continue
    }
    parsed = parsed + [entry]
  }
  return parsed
}
// How specifically a group's user-agent list matches `target`:
// 2 = an agent equals the target (after trim + lowercase), 1 = only the `*`
// wildcard matched, 0 = no match.
fn __web_robots_group_score(agents, target) {
  var best = 0
  for raw in agents {
    let name = lowercase(trim(raw))
    if name == target {
      best = 2
    } else if best < 1 && name == "*" {
      best = 1
    }
  }
  return best
}
// Append a finished robots group to `groups` when it applies to `target`.
// Groups with no agents, no rules, or a zero match score are ignored.
fn __web_robots_push_group(groups, agents, rules, target) {
  let incomplete = len(agents) == 0 || len(rules) == 0
  if incomplete {
    return groups
  }
  let relevance = __web_robots_group_score(agents, target)
  if relevance > 0 {
    return groups + [{score: relevance, rules: rules}]
  }
  return groups
}
// Collect the Allow/Disallow rules of a robots.txt body that apply to
// `user_agent` (default "*"). Rows are grouped by their preceding User-agent
// lines; only the groups with the highest match score contribute, and their
// rules are concatenated in file order.
fn __web_robots_rules(body, user_agent) {
let target = lowercase(trim(user_agent ?? "*"))
var groups = []
var agents = []
var group_rules = []
for row in __web_robots_rows(body) {
if row.field == "user-agent" {
// A User-agent line that follows collected rules closes the current group.
// Consecutive User-agent lines (no rules in between) share one group.
if len(group_rules) > 0 {
groups = __web_robots_push_group(groups, agents, group_rules, target)
agents = []
group_rules = []
}
agents = agents + [row.value]
} else if row.field == "allow" || row.field == "disallow" {
group_rules = group_rules + [{kind: row.field, path: row.value}]
}
// Any other field is ignored here.
}
// Flush the trailing group.
groups = __web_robots_push_group(groups, agents, group_rules, target)
var best_score = 0
for group in groups {
if group.score > best_score {
best_score = group.score
}
}
// Keep only the best-scoring groups, so an exact agent match takes over
// from `*` groups.
var rules = []
for group in groups {
if group.score == best_score {
rules = rules + group.rules
}
}
return rules
}
// The part of `url` that robots rules are matched against: its path
// (default "/") plus "?query" when a non-empty query string is present.
fn __web_path_for_robots(url) {
  let parts = url_parse(url)
  let path = parts.path ?? "/"
  let query = parts?.query
  if query != nil && query != "" {
    return path + "?" + query
  }
  return path
}
/**
 * Return whether robots.txt permits `user_agent` to fetch `url`.
 *
 * The robots file is fetched from `options.robots_url`, or from
 * `/robots.txt` on the URL's origin, using `options.fetch_options`. A fetch
 * that is not `ok` allows by default. Matching is a deterministic subset:
 * exact or `*` user-agent groups, Allow/Disallow path prefixes, and
 * longest-prefix precedence, with Allow winning ties. Rules with an empty
 * path are ignored.
 *
 * @effects: [net]
 * @allocation: stack-only
 * @errors: []
 * @api_stability: stable
 * @example: robots_allowed(url, user_agent, options)
 */
pub fn robots_allowed(url: string, user_agent: string = "*", options = nil) -> bool {
  let opts = options ?? {}
  let robots_url = opts?.robots_url ?? __web_robots_url(url)
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  if !(robots.ok ?? false) {
    return true
  }
  let target_path = __web_path_for_robots(url)
  var verdict = true
  var longest = -1
  for rule in __web_robots_rules(robots.body, user_agent) {
    if rule.path != "" && starts_with(target_path, rule.path) {
      let weight = len(rule.path)
      let is_allow = rule.kind == "allow"
      // A longer prefix wins; on equal length an Allow rule takes over.
      if weight > longest || (weight == longest && is_allow) {
        longest = weight
        verdict = is_allow
      }
    }
  }
  return verdict
}
// Decode the five predefined XML entities (&lt; &gt; &quot; &apos; &amp;)
// in `text`.
//
// `&amp;` is decoded last on purpose: decoding it first would turn an
// escaped entity such as `&amp;lt;` into `&lt;` and then into `<`,
// unescaping the text twice.
fn __web_xml_unescape(text) {
  var out = regex_replace("&lt;", "<", text)
  out = regex_replace("&gt;", ">", out)
  out = regex_replace("&quot;", "\"", out)
  out = regex_replace("&apos;", "'", out)
  return regex_replace("&amp;", "&", out)
}
// Extract the unique, trimmed, XML-unescaped contents of every <loc>
// element in `xml`, in document order. Empty values are skipped.
fn __web_sitemap_locs(xml) {
  let matches = regex_captures("(?is)<loc\\b[^>]*>(.*?)</loc>", xml ?? "")
  var locations = []
  for found in matches {
    let raw = found.groups[0] ?? ""
    let location = trim(__web_xml_unescape(raw))
    if location == "" || locations.contains(location) {
      continue
    }
    locations = locations + [location]
  }
  return locations
}
// Decide which sitemap URLs to fetch: explicit `opts.sitemap_urls` win;
// otherwise the unique `Sitemap:` entries advertised by robots.txt; and if
// none are found, `/sitemap.xml` on the origin of `base_url`.
fn __web_sitemap_candidates(base_url, opts) {
  let explicit = opts?.sitemap_urls
  if explicit != nil {
    return explicit
  }
  let robots_url = opts?.robots_url ?? web_origin_url(base_url, "/robots.txt")
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  var advertised = []
  if robots.ok {
    for row in __web_robots_rows(robots.body) {
      let is_new = row.field == "sitemap" && row.value != "" && !advertised.contains(row.value)
      if is_new {
        advertised = advertised + [row.value]
      }
    }
  }
  if len(advertised) > 0 {
    return advertised
  }
  return [web_origin_url(base_url, "/sitemap.xml")]
}
/**
 * Discover URLs from robots-advertised sitemaps or `/sitemap.xml`.
 *
 * At most `max_sitemaps` (default 10) sitemap documents are fetched.
 * Sitemaps whose fetch is not `ok` are skipped, and the `<loc>` values of
 * the rest are returned de-duplicated in discovery order.
 *
 * Options: sitemap_urls, robots_url, fetch_options, max_sitemaps.
 *
 * @effects: [net]
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: sitemap_urls(base_url, options)
 */
pub fn sitemap_urls(base_url: string, options = nil) -> list<string> {
  let opts = options ?? {}
  let cap = opts?.max_sitemaps ?? 10
  var discovered = []
  var fetched = 0
  for candidate in __web_sitemap_candidates(base_url, opts) {
    if fetched >= cap {
      return discovered
    }
    fetched = fetched + 1
    let page = web_fetch(candidate, opts?.fetch_options ?? {})
    if !page.ok {
      continue
    }
    for loc in __web_sitemap_locs(page.body) {
      if !discovered.contains(loc) {
        discovered = discovered + [loc]
      }
    }
  }
  return discovered
}