import json
import re
import sys
def slugify(term):
    """Turn *term* into a lowercase markdown anchor id.

    Punctuation (anything that is not a word character, whitespace, or a
    hyphen) is dropped, surrounding whitespace is trimmed, and interior
    spaces become hyphens.
    """
    cleaned = re.sub(r"[^\w\s-]", "", term.lower())
    return cleaned.strip().replace(" ", "-")
def parse_glossary(content):
    """Extract term -> URL maps from the glossary chapter's markdown.

    Local terms come from definition-list entries: a term line followed by a
    line starting with " : ". External terms come from a mandatory
    ``<!-- external-glossary-links ... -->`` HTML comment block containing one
    ``term: url`` (or ``term | url``) pair per line.

    Returns a ``(local_terms, external_terms)`` pair of dicts.
    Raises ValueError when the external-links comment block is absent.
    """
    local = {}
    for match in re.finditer(r"(?m)^([^\s#<>!`\[].+)\n : ", content):
        name = match.group(1).strip()
        local[name] = f"glossary.md#{slugify(name)}"

    block = re.search(
        r"<!--\s*external-glossary-links\s*\n(.*?)-->", content, re.DOTALL
    )
    if block is None:
        raise ValueError("Glossary is missing <!-- external-glossary-links --> block")

    external = {}
    for raw in block.group(1).strip().splitlines():
        entry = raw.strip()
        if not entry:
            continue
        # Split on the first ":" or "|" only, so URLs containing ":" survive.
        pieces = re.split(r"\s*[:|]\s*", entry, maxsplit=1)
        if len(pieces) != 2:
            continue
        name, url = pieces[0].strip(), pieces[1].strip()
        if name and url:
            external[name] = url
    return local, external
# Markdown regions the linker must never rewrite, in alternation order:
# fenced code blocks, inline code spans, existing links, HTML tags/comments,
# blockquote lines, definition-list term lines, and ATX headings.
_SKIP_PATTERN = re.compile(
    r"^```[^\n]*\n[\s\S]*?^```\s*$" r"|`[^`\n]+`" r"|\[[^\]]*\]\([^\)]*\)" r"|<[^>\n]+>" r"|^\s*>.*$" r"|^.+(?=\n : )" r"|^#{1,6}\s+.*$", re.MULTILINE,
)

# Marker comment that disables linking of a given term on the FOLLOWING line.
# Hoisted to module level so it is compiled once, not on every call.
_SUPPRESS_PATTERN = re.compile(r"<!--\s*no-glossary-link:([\w\s-]+?)\s*-->")


def link_terms_in_content(content, terms, first_only=True, url_prefix=""):
    """Replace glossary-term mentions in markdown *content* with links.

    *terms* maps canonical term -> URL. Longer terms win over shorter ones,
    a simple trailing-"s" plural still matches, and a case-insensitive
    lookup is used as a fallback. Occurrences inside the regions matched by
    ``_SKIP_PATTERN`` are left untouched, as are terms suppressed on a line
    by a ``<!-- no-glossary-link: term -->`` comment on the preceding line.

    When *first_only* is true, only the first occurrence of each canonical
    term is linked. *url_prefix* is prepended to URLs that do not start
    with "http" (i.e. book-relative anchors).
    """
    if not terms:
        return content
    # Longest terms first so e.g. "virtual machine" beats "machine".
    sorted_terms = sorted(terms, key=len, reverse=True)
    term_pattern = re.compile(
        r"\b(" + "|".join(re.escape(t) + r"s?" for t in sorted_terms) + r")\b"
    )

    protected = [(m.start(), m.end()) for m in _SKIP_PATTERN.finditer(content)]

    # Map each line immediately following a no-glossary-link comment to the
    # set of lowercased terms suppressed on that line.
    suppressed_at_line = {}
    for m in _SUPPRESS_PATTERN.finditer(content):
        newline = content.find("\n", m.end())
        if newline == -1:
            continue  # comment is on the last line: nothing follows to suppress
        line_start = newline + 1
        line_end = content.find("\n", line_start)
        if line_end == -1:
            line_end = len(content)
        suppressed_at_line.setdefault((line_start, line_end), set()).add(
            m.group(1).strip().lower()
        )

    def in_protected(start, end):
        # True when the match lies entirely inside a protected span.
        return any(ps <= start and end <= pe for ps, pe in protected)

    def is_suppressed(canonical, start):
        lowered = canonical.lower()
        return any(
            ls <= start < le and lowered in names
            for (ls, le), names in suppressed_at_line.items()
        )

    def resolve(matched):
        # Map a regex match back to its canonical dict key: try the exact
        # text, then the singular form. Strip exactly ONE trailing "s" --
        # rstrip("s") would strip all of them and break terms that
        # themselves end in "s" (e.g. plural "bosss" of term "boss").
        singular = matched[:-1] if matched.endswith("s") else matched
        for candidate in (matched, singular):
            if candidate in terms:
                return candidate
            for t in terms:
                if t.lower() == candidate.lower():
                    return t
        return None

    linked = set()
    result = []
    last = 0
    for m in term_pattern.finditer(content):
        matched_term = m.group(1)
        canonical = resolve(matched_term)
        if canonical is None:
            continue
        if first_only and canonical in linked:
            continue
        if in_protected(m.start(), m.end()) or is_suppressed(canonical, m.start()):
            continue
        linked.add(canonical)
        url = terms[canonical]
        if url_prefix and not url.startswith("http"):
            url = url_prefix + url
        result.append(content[last : m.start()])
        result.append(f"[{matched_term}]({url})")
        last = m.end()
    result.append(content[last:])
    return "".join(result)
def find_glossary_content(items):
    """Depth-first search of mdBook *items* for the glossary chapter.

    Returns the raw markdown content of the first chapter whose path ends in
    "glossary.md", or None when no such chapter exists. Non-chapter entries
    (e.g. separators) and draft chapters without a path are skipped.
    """
    for entry in items:
        if not isinstance(entry, dict) or "Chapter" not in entry:
            continue
        chapter = entry["Chapter"]
        path = chapter.get("path")
        if path and path.endswith("glossary.md"):
            return chapter["content"]
        found = find_glossary_content(chapter.get("sub_items", []))
        if found is not None:
            return found
    return None
def process_item(item, local_terms, external_terms):
    """Recursively add glossary links to a chapter and all its sub-chapters.

    The glossary page itself only receives external links (every occurrence,
    so each definition can link out repeatedly). Every other chapter gets
    both local and external terms, with local "glossary.md#..." anchors made
    relative via a "../" prefix per directory level in the chapter's path.
    Non-chapter entries and draft chapters (no path) are left untouched.
    """
    if not isinstance(item, dict) or "Chapter" not in item:
        return
    chapter = item["Chapter"]
    path = chapter.get("path", "")
    if path:
        if path.endswith("glossary.md"):
            chapter["content"] = link_terms_in_content(
                chapter["content"], external_terms, first_only=False
            )
        else:
            # Climb back up to the book root so the glossary anchor resolves.
            depth_prefix = "../" * path.count("/")
            combined = {**local_terms, **external_terms}
            chapter["content"] = link_terms_in_content(
                chapter["content"], combined, url_prefix=depth_prefix
            )
    for child in chapter.get("sub_items", []):
        process_item(child, local_terms, external_terms)
def main():
    """mdBook preprocessor entry point.

    Reads the ``[context, book]`` JSON array from stdin (skipping blank
    lines), links glossary terms throughout the book's chapters, and writes
    the modified book JSON to stdout. If no glossary chapter exists, the
    book is emitted unchanged.
    """
    for raw in sys.stdin:
        if not raw.strip():
            continue
        _context, book = json.loads(raw)
        glossary = find_glossary_content(book["items"])
        if glossary is not None:
            local_terms, external_terms = parse_glossary(glossary)
            for item in book["items"]:
                process_item(item, local_terms, external_terms)
        json.dump(book, fp=sys.stdout)
        return


if __name__ == "__main__":
    main()