import re
import urllib.request
from html.parser import HTMLParser
from pathlib import Path
class OpcodeParser(HTMLParser):
    """Collect opcode descriptions from ``<dt>``/``<dd>`` definition lists.

    Text inside a ``<dt>`` becomes the current opcode name. Text inside
    ``<p>`` elements nested in the following ``<dd>`` becomes its
    description, one paragraph per ``<p>``. When the ``<dd>`` closes, the
    paragraphs are joined with a blank line and stored in ``self.opcodes``
    under the opcode name.
    """

    def __init__(self):
        super().__init__()
        # Finished results: opcode name -> description text.
        self.opcodes = {}
        # Name taken from the most recent <dt>, or None.
        self.current_opcode = None
        # Paragraphs collected so far for the current <dd>.
        self.current_desc = []
        # Flags tracking which elements the parser is currently inside.
        self.in_dt = False
        self.in_dd = False
        self.in_p = False
        self.capture_text = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<dt>``, ``<dd>`` and ``<p>``-inside-``<dd>``."""
        if tag == "dt":
            self.in_dt = True
            self.capture_text = True
        elif tag == "dd":
            self.in_dd = True
        elif tag == "p" and self.in_dd:
            self.in_p = True
            self.capture_text = True
            # Open a fresh paragraph slot so the text of this <p> does not
            # get glued onto the previous paragraph. Reuse a still-empty
            # slot instead of stacking several empty ones.
            if not self.current_desc or self.current_desc[-1]:
                self.current_desc.append("")

    def handle_endtag(self, tag):
        """Close the tracked element; on ``</dd>`` store the description."""
        if tag == "dt":
            self.in_dt = False
            self.capture_text = False
        elif tag == "dd":
            # Empty slots come from <p> elements that contained no text.
            paragraphs = [para for para in self.current_desc if para]
            if self.current_opcode and paragraphs:
                self.opcodes[self.current_opcode] = "\n\n".join(paragraphs)
            self.in_dd = False
            self.current_opcode = None
            self.current_desc = []
        elif tag == "p" and self.in_dd:
            self.in_p = False
            self.capture_text = False

    def handle_data(self, data):
        """Record stripped text as the opcode name or as description text."""
        text = data.strip()
        if not text:
            return
        if self.in_dt:
            self.current_opcode = text
        elif self.in_p and self.in_dd:
            if not self.current_desc:
                self.current_desc.append(text)
            elif self.current_desc[-1]:
                # Later chunk of the same paragraph (e.g. after an inline tag).
                self.current_desc[-1] += " " + text
            else:
                # First chunk of a paragraph opened by handle_starttag.
                self.current_desc[-1] = text

    def handle_entityref(self, name):
        """Append a decoded named entity to the current paragraph.

        With ``HTMLParser``'s default ``convert_charrefs=True`` the parser
        decodes entities itself and delivers them through ``handle_data``,
        so this hook only runs when it is called directly or when that
        option is disabled. Unknown entities are kept as ``&name;``.
        """
        if not self.capture_text:
            return
        entities = {"lt": "<", "gt": ">", "amp": "&", "quot": '"', "nbsp": " "}
        char = entities.get(name, f"&{name};")
        if self.in_p and self.in_dd and self.current_desc:
            self.current_desc[-1] += char
def fetch_opcode_docs(local_file=None):
    """Load the SQLite opcode reference page and extract opcode descriptions.

    Args:
        local_file: Optional path to a saved copy of the page. When it is
            given and exists, it is read (as UTF-8) instead of downloading
            ``https://sqlite.org/opcode.html``.

    Returns:
        A dict mapping each opcode name to its plain-text description.
        Paragraphs are separated by a blank line, tags are removed and
        HTML character references are decoded.

    Raises:
        urllib.error.URLError: If the page has to be downloaded and the
            request fails (raised by ``urllib.request.urlopen``).
    """
    # Imported locally: the module itself only imports ``html.parser``.
    from html import unescape

    if local_file and Path(local_file).exists():
        print(f"Reading from local file: {local_file}")
        page = Path(local_file).read_text(encoding="utf-8")
    else:
        url = "https://sqlite.org/opcode.html"
        print(f"Fetching {url}...")
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            page = response.read().decode("utf-8")

    def _plain_text(fragment):
        """Drop tags from *fragment* and collapse runs of whitespace."""
        return " ".join(re.sub(r"<[^>]+>", "", fragment).split())

    opcodes = {}
    # Each opcode row is an anchor named after the opcode, the name itself,
    # and then a <td> holding the description.
    pattern = r'<a name="([A-Za-z0-9_]+)"></a>\1\s*\n<td>(.*?)</td></tr>'
    for match in re.finditer(pattern, page, re.DOTALL):
        name = match.group(1).strip()
        desc_html = match.group(2).strip()

        first_p_match = re.search(r"<p>", desc_html)
        if first_p_match:
            # Text before the first <p> is its own paragraph; the rest is
            # split at paragraph boundaries.
            lead = desc_html[: first_p_match.start()]
            rest = desc_html[first_p_match.start():]
            fragments = [lead, *re.split(r"</p>\s*(?:<p>)?", rest)]
        else:
            fragments = [desc_html]

        desc_parts = [text for text in map(_plain_text, fragments) if text]

        # Decode entities only after the tags are gone, so an escaped
        # "&lt;b&gt;" in the text is not mistaken for markup. unescape()
        # turns &nbsp; into U+00A0; the output uses a plain space instead.
        full_desc = unescape("\n\n".join(desc_parts)).replace("\xa0", " ")
        if full_desc:
            opcodes[name] = full_desc

    print(f"Found {len(opcodes)} opcodes")
    return opcodes
def escape_brackets(text: str) -> str:
    """Return *text* with every non-empty ``[...]`` pair backslash-escaped.

    ``[P1]`` becomes ``\\[P1\\]``. An empty ``[]`` and unmatched brackets
    are left alone. Presumably this keeps the Rust doc tooling from reading
    the brackets as link syntax -- confirm against how the output is used.
    """

    def _escaped(found):
        # Rebuild the match with a backslash in front of each bracket.
        return "\\[" + found.group(1) + "\\]"

    return re.sub(r'\[([^\]]+)\]', _escaped, text)
def format_doc_comment(desc: str, indent: str = " ") -> str:
    """Render *desc* as word-wrapped ``///`` doc-comment lines.

    Args:
        desc: Description text; paragraphs are separated by a blank line.
            Square-bracket pairs are escaped via ``escape_brackets`` first.
        indent: Text placed in front of every emitted ``///`` line.

    Returns:
        The comment lines joined with newlines. Paragraphs are separated by
        a bare ``///`` line. Returns an empty string for an empty *desc*.
    """
    lines = []
    paragraphs = escape_brackets(desc).split("\n\n")
    last_index = len(paragraphs) - 1
    for i, para in enumerate(paragraphs):
        # Wrapped lines of a list item are indented under the bullet.
        is_list_item = para.lstrip().startswith(("* ", "- ", "• "))
        continuation_prefix = " " if is_list_item else ""
        current_line = ""
        is_first_line = True
        for word in para.split():
            max_len = 72 if is_first_line else (72 - len(continuation_prefix))
            # Only wrap when there is something to flush: a single word
            # longer than the limit must not produce an empty "///" line.
            if current_line and len(current_line) + len(word) + 1 > max_len:
                lines.append(f"{indent}/// {current_line}")
                current_line = continuation_prefix + word
                is_first_line = False
            elif current_line:
                current_line = current_line + " " + word
            else:
                current_line = word
        if current_line:
            lines.append(f"{indent}/// {current_line}")
        if i < last_index:
            lines.append(f"{indent}///")
    return "\n".join(lines)
def update_insn_rs(opcodes: dict):
    """Rewrite the doc comments of the instruction variants in ``src/insn.rs``.

    For each known variant, the ``///`` comment block directly above it (if
    any) is replaced with the formatted SQLite description of the matching
    opcode. The struct-like form (``Name {``) is tried first, then the unit
    form (``Name,``). Only the first occurrence of a variant is touched.

    Args:
        opcodes: Mapping of SQLite opcode name to description text, as
            returned by ``fetch_opcode_docs``.
    """
    insn_path = Path(__file__).parent.parent / "src" / "insn.rs"
    print(f"Reading {insn_path}...")
    content = insn_path.read_text(encoding="utf-8")
    # Variant name in insn.rs -> opcode name in the SQLite documentation.
    variant_to_opcode = {
        "Integer": "Integer",
        "Int64": "Int64",
        "Real": "Real",
        "String8": "String8",
        "Null": "Null",
        "Add": "Add",
        "Subtract": "Subtract",
        "Multiply": "Multiply",
        "Divide": "Divide",
        "Remainder": "Remainder",
        "Concat": "Concat",
        "BitAnd": "BitAnd",
        "BitOr": "BitOr",
        "ShiftLeft": "ShiftLeft",
        "ShiftRight": "ShiftRight",
        "BitNot": "BitNot",
        "Not": "Not",
        "AddImm": "AddImm",
        "Copy": "Copy",
        "SCopy": "SCopy",
        "Move": "Move",
        "IntCopy": "IntCopy",
        "Halt": "Halt",
        "HaltWithError": "Halt",
        "HaltIfNull": "HaltIfNull",
        "Goto": "Goto",
        "Gosub": "Gosub",
        "Return": "Return",
        "If": "If",
        "IfNot": "IfNot",
        "IsNull": "IsNull",
        "NotNull": "NotNull",
        "Once": "Once",
        "Jump": "Jump",
        "Eq": "Eq",
        "Ne": "Ne",
        "Lt": "Lt",
        "Le": "Le",
        "Gt": "Gt",
        "Ge": "Ge",
        "IfPos": "IfPos",
        "IfNotZero": "IfNotZero",
        "DecrJumpZero": "DecrJumpZero",
        "MustBeInt": "MustBeInt",
        "ResultRow": "ResultRow",
        "OpenRead": "OpenRead",
        "OpenWrite": "OpenWrite",
        "OpenEphemeral": "OpenEphemeral",
        "Close": "Close",
        "Rewind": "Rewind",
        "Next": "Next",
        "Prev": "Prev",
        "Last": "Last",
        "SeekGE": "SeekGE",
        "SeekGT": "SeekGT",
        "SeekLE": "SeekLE",
        "SeekLT": "SeekLT",
        "SeekRowid": "SeekRowid",
        "Column": "Column",
        "Rowid": "Rowid",
        "NewRowid": "NewRowid",
        "Insert": "Insert",
        "Delete": "Delete",
        "MakeRecord": "MakeRecord",
        "IdxInsert": "IdxInsert",
        "IdxDelete": "IdxDelete",
        "IdxRowid": "IdxRowid",
        "Init": "Init",
        "InitCoroutine": "InitCoroutine",
        "Yield": "Yield",
        "EndCoroutine": "EndCoroutine",
        "AggStep": "AggStep",
        "AggFinal": "AggFinal",
        "Noop": "Noop",
        "Explain": "Explain",
    }
    # Optional run of existing "///" lines directly above the variant.
    existing_docs = r"( /// [^\n]*\n(?: ///[^\n]*\n)*)?"
    # (regex for what follows the variant name, text written back for it)
    variant_forms = ((r"\s*\{", " {"), (",", ","))
    updates_made = 0
    for variant, opcode in variant_to_opcode.items():
        if opcode not in opcodes:
            print(f" Warning: No documentation found for {opcode}")
            continue
        doc_comment = format_doc_comment(opcodes[opcode])
        name = re.escape(variant)
        for suffix_pattern, suffix_text in variant_forms:
            replacement = f"{doc_comment}\n {variant}{suffix_text}"
            # A callable replacement keeps the backslashes produced by
            # bracket escaping from being read as regex template escapes.
            content, matched = re.subn(
                rf"{existing_docs} {name}{suffix_pattern}",
                lambda _m, text=replacement: text,
                content,
                count=1,
            )
            # Decide by match count, not by comparing text: a variant whose
            # docs are already up to date still counts as handled and must
            # not fall through to the unit-variant pattern.
            if matched:
                updates_made += 1
                break
    print(f"Writing {insn_path} with {updates_made} documentation updates...")
    insn_path.write_text(content, encoding="utf-8")
    print("Done!")
def main():
    """Script entry point: load the opcode docs, preview a few, update insn.rs.

    The first command-line argument, when present, is the path of a saved
    copy of the opcode page; otherwise ``/tmp/opcode.html`` is used.
    """
    import sys

    cli_args = sys.argv[1:]
    local_file = cli_args[0] if cli_args else "/tmp/opcode.html"
    opcodes = fetch_opcode_docs(local_file)

    # Show a short preview so a broken scrape is noticed before writing.
    print("\nExample opcodes found:")
    for name in ("Add", "Integer", "Goto", "Halt"):
        if name not in opcodes:
            continue
        print(f"\n{name}:")
        print(f" {opcodes[name][:200]}...")
    print("\n")

    update_insn_rs(opcodes)
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()