from pathlib import Path
import urllib.parse
import re
import aiohttp
from bs4 import BeautifulSoup
# Directory under which fetched pages are stored, keyed by their page path.
CACHE_DIR = Path("cache")

# Parsed base URL of the reference manual; page paths are appended to its
# path component to form full page URLs.
GUROBI_REF_MAN_URL = urllib.parse.urlparse(
    "https://www.gurobi.com/documentation/11.0/refman/"
)
# Compiled patterns for boilerplate paragraphs. A documentation paragraph is
# discarded when any of these patterns matches it in full.
DOC_REMOVE = [
    re.compile(pattern)
    for pattern in (
        r"For examples of how to query or modify parameter values from our different APIs, refer to our Parameter Examples.",
        r"For examples of how to query or modify attributes, refer to our Attribute Examples.",
        r"Next:.+Up:.+Previous.+",
        r"One important note about integer-valued parameters: while the maximum value that can be stored in a signed integer is.+",
        r"Please refer to this section for more information on SOS constraints.",
        "next",
        "up",
        "previous",
    )
]
def remove_newlines(s: str) -> str:
    """Join the non-empty lines of *s* into one line separated by single spaces.

    Lines are not stripped: whitespace inside a line is kept as-is, and a
    line consisting only of whitespace still counts as non-empty.
    """
    kept_lines = [line for line in s.splitlines() if line]
    return ' '.join(kept_lines)
def get_url(path: str):
    """Return the absolute URL of the reference-manual page at *path*.

    *path* is appended verbatim to the path component of
    ``GUROBI_REF_MAN_URL``; the other URL components are kept unchanged.
    """
    page_path = GUROBI_REF_MAN_URL.path + path
    return GUROBI_REF_MAN_URL._replace(path=page_path).geturl()
async def fetch_html(session: aiohttp.ClientSession, path: str):
    """Return the HTML text of the reference-manual page at *path*.

    The page is looked up in ``CACHE_DIR`` first; on a miss it is downloaded
    with *session* from ``get_url(path)`` and written to the cache before
    being returned.

    Cache files are always read and written as UTF-8 so the result does not
    depend on the platform's default locale encoding.

    Raises:
        Exception: if the server answers with a status other than 200.
    """
    cache_path = CACHE_DIR / path
    if cache_path.exists():
        return cache_path.read_text(encoding="utf-8")

    url = get_url(path)
    async with session.get(url) as res:
        print("GET", url, res.status)
        if res.status != 200:
            # Same exception type as before; the message now says which
            # request failed and with what status.
            raise Exception(f"bad request: GET {url} returned status {res.status}")
        text = await res.text()

    # *path* may contain sub-directories, so make sure they exist first.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    cache_path.write_text(text, encoding="utf-8")
    return text
def _clean_documentation(s: str) -> list:
    """Split raw documentation text into cleaned paragraphs.

    *s* is split on blank lines (``'\\n\\n'``). Each chunk is stripped and
    collapsed onto a single line with ``remove_newlines``. Chunks that end up
    empty, or that are matched in full by any pattern in ``DOC_REMOVE``, are
    dropped.

    Returns:
        The remaining paragraphs, in their original order.
    """
    paragraphs = []
    for chunk in s.split('\n\n'):
        paragraph = remove_newlines(chunk.strip())
        # Test for emptiness *after* stripping: a whitespace-only chunk
        # (e.g. the "\n" left over when the text ends in three newlines)
        # would otherwise come through as an empty paragraph.
        if not paragraph:
            continue
        if any(r.fullmatch(paragraph) for r in DOC_REMOVE):
            continue
        paragraphs.append(paragraph)
    return paragraphs
# Maps the "Type:" value shown on a documentation page to the short code
# stored under the 'dtype' key of the scraped record.
_DTYPES = dict(
    double="dbl",
    string="str",
    int="int",
    char="chr",
)
async def fetch_parameter_data(session: aiohttp.ClientSession, name: str, path: str) -> dict:
    """Scrape one parameter page into a dict.

    The returned dict has the keys ``name``, ``url``, ``ty``, ``dtype``,
    ``default``, ``cl_only`` and ``doc``; ``min`` and ``max`` are added when
    the parameter type is ``int`` or ``double``. ``doc`` is the list of
    cleaned paragraphs, and ``cl_only`` is true when one of those paragraphs
    is exactly ``"Note: Command-line only"``.

    Raises:
        KeyError: if the page's type is not a key of ``_DTYPES``.
    """
    html = await fetch_html(session, path)
    soup = BeautifulSoup(html, 'html.parser')
    replace_images_with_alt(soup)
    content = soup.find("div", class_="documentation__content")
    table = content.find("table")

    def cell_text(label):
        # The label string sits two levels below its own cell; the value is
        # the text of the <td> that follows that cell.
        label_node = table.find(string=label, recursive=True)
        return label_node.parent.parent.find_next_sibling('td').text

    data = {"name": name, "url": get_url(path)}

    ty = cell_text("Type:")
    data['ty'] = ty
    data['dtype'] = _DTYPES[ty]
    data['default'] = cell_text("Default value:")

    # Only the numeric types have their bounds read from the table.
    if ty in ('int', "double"):
        data['min'] = cell_text("Minimum value:")
        data['max'] = cell_text("Maximum value:")

    paragraphs = _clean_documentation(table.find_next_sibling('p').text)
    data['cl_only'] = "Note: Command-line only" in paragraphs
    data['doc'] = paragraphs
    return data
# Maps the title of an attribute's parent section (with the " Attributes"
# suffix removed) to the short object-type code stored under 'otype'.
_OTYPES = {
    # Several sections all map to the model object type.
    **dict.fromkeys(
        ("Model", "Multi-objective", "Multi-Scenario", "Quality"),
        "model",
    ),
    "Linear Constraint": "constr",
    "Quadratic Constraint": "qconstr",
    "SOS": "sos",
    "Variable": "var",
    "General Constraint": "gconstr",
    "Batch": "batch",
}
async def fetch_attribute_data(session: aiohttp.ClientSession, name: str, path: str) -> dict:
    """Scrape one attribute page into a dict.

    The returned dict has the keys ``name``, ``url``, ``ty``, ``dtype``,
    ``otype``, ``modifiable``, ``cl_only`` and ``doc``. ``otype`` comes from
    the text of the link that follows the "Up:" label in the navigation
    block, with ``" Attributes"`` removed and mapped through ``_OTYPES``.
    ``modifiable`` is true when the "Modifiable:" cell reads "yes" (compared
    case-insensitively). ``doc`` is the list of cleaned paragraphs, and
    ``cl_only`` is true when one of them is exactly
    ``"Note: Command-line only"``.

    Raises:
        KeyError: if the page's type is not a key of ``_DTYPES`` or its
            parent section is not a key of ``_OTYPES``.
        AttributeError: if an expected element is missing from the page
            (a BeautifulSoup lookup returned ``None``).
    """
    doc = await fetch_html(session, path)
    soup = BeautifulSoup(doc, 'html.parser')
    replace_images_with_alt(soup)
    table = soup.find("div", class_="documentation__content").find("table")

    def cell_text(label: str) -> str:
        # The label string sits two levels below its own cell; the value is
        # the text of the <td> that follows that cell.
        return table.find(string=label).parent.parent.find_next_sibling('td').text

    data = {"name": name, "url": get_url(path)}
    ty = cell_text("Type:")
    mod = cell_text("Modifiable:")

    # Use `string=` (as the other lookups in this file do) rather than the
    # deprecated `text=` alias. The filter is called with each <b> tag's
    # `.string`, which is None when the tag does not hold a single string,
    # so guard before the substring test.
    up_label = table.parent.find(class_='navigation').find(
        'b', string=lambda s: s is not None and "Up:" in s
    )
    object_ty = up_label.find_next_sibling("a").text.replace(" Attributes", "")

    data['ty'] = ty
    data['dtype'] = _DTYPES[ty]
    data['otype'] = _OTYPES[object_ty]
    data['modifiable'] = mod.lower() == "yes"

    documentation = _clean_documentation(table.find_next_sibling('p').text)
    data['cl_only'] = "Note: Command-line only" in documentation
    data['doc'] = documentation
    return data
async def fetch_parameter_list(session: aiohttp.ClientSession):
    """Return a dict mapping each parameter name to its page href.

    The entries are the links inside the ``<ul>`` that follows the
    "Parameter Descriptions" link on ``parameters.html``.
    """
    page = await fetch_html(session, "parameters.html")
    soup = BeautifulSoup(page, 'html.parser')
    heading_link = soup.find("a", string="Parameter Descriptions")
    link_list = heading_link.find_next_sibling('ul')

    parameters = {}
    for link in link_list.find_all('a'):
        parameters[link.text] = link.attrs['href']
    return parameters
async def fetch_attribute_list(session: aiohttp.ClientSession):
    """Return a dict mapping each attribute name to its page href.

    The entries are the links inside the ``ChildLinks`` list on
    ``attributes.html``. Links whose text contains "Attributes" or
    "Examples" are left out.
    """
    page = await fetch_html(session, "attributes.html")
    soup = BeautifulSoup(page, 'html.parser')
    links = soup.find('ul', class_="ChildLinks").find_all('a')
    return {
        link.text: link.attrs['href']
        for link in links
        if 'Attributes' not in link.text and 'Examples' not in link.text
    }
def http_session():
    """Create the ``aiohttp.ClientSession`` used for all page downloads.

    The session sends a fixed Firefox ``user-agent`` header and its
    connector is created with ``limit=20``.
    """
    headers = {
        'user-agent': "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0",
    }
    connector = aiohttp.TCPConnector(limit=20)
    return aiohttp.ClientSession(connector=connector, headers=headers)
def replace_images_with_alt(soup):
    """Replace every ``<img>`` in *soup* that has an ``alt`` attribute, in place.

    The replacement is the ``alt`` text parsed with ``html.parser``. Images
    without an ``alt`` attribute are left untouched.
    """
    for image in soup.find_all("img"):
        if "alt" not in image.attrs:
            continue
        # NOTE(review): the alt text is parsed as HTML rather than inserted
        # as a plain string - confirm this is intended for alt text that
        # contains '<' or '&'.
        replacement = BeautifulSoup(image.attrs["alt"], "html.parser")
        image.replace_with(replacement)