import requests
from bs4 import BeautifulSoup
import bs4
# Wikipedia pages scraped for each character category. Every list below is
# referenced by the ``categories`` table at the end of this section.
symbols = [
    "https://en.wikipedia.org/wiki/Currency_Symbols_(Unicode_block)",
    "https://en.wikipedia.org/wiki/General_Punctuation",
    "https://en.wikipedia.org/wiki/Letterlike_Symbols",
    "https://en.wikipedia.org/wiki/Number_Forms",
    "https://en.wikipedia.org/wiki/Miscellaneous_Symbols",
]

phonetic = [
    "https://en.wikipedia.org/wiki/IPA_Extensions",
    "https://en.wikipedia.org/wiki/Spacing_Modifier_Letters",
    "https://en.wikipedia.org/wiki/Phonetic_Extensions",
    "https://en.wikipedia.org/wiki/Phonetic_Extensions_Supplement",
    "https://en.wikipedia.org/wiki/Modifier_Tone_Letters",
    "https://en.wikipedia.org/wiki/Superscripts_and_Subscripts",
]

enclosed = [
    "https://en.wikipedia.org/wiki/Enclosed_alphanumerics",
]

enclosed_supplement = [
    "https://en.wikipedia.org/wiki/Enclosed_Alphanumeric_Supplement",
    "https://en.wikipedia.org/wiki/Enclosed_Ideographic_Supplement",
]

arrows = [
    "https://en.wikipedia.org/wiki/Arrows_(Unicode_block)",
]

arrows_supplement = [
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-A",
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-B",
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-C",
]

dingbat = [
    "https://en.wikipedia.org/wiki/Dingbat",
]

math = [
    "https://en.wikipedia.org/wiki/Mathematical_Operators",
    "https://en.wikipedia.org/wiki/Supplemental_Mathematical_Operators",
    "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-A",
    "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-B",
]

tech = [
    "https://en.wikipedia.org/wiki/Miscellaneous_Technical",
]

games = [
    "https://en.wikipedia.org/wiki/Mahjong_Tiles_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Domino_Tiles",
    "https://en.wikipedia.org/wiki/Unicode_Playing_Card_Block",
]

box = [
    "https://en.wikipedia.org/wiki/Box_Drawing",
    "https://en.wikipedia.org/wiki/Block_Elements",
    "https://en.wikipedia.org/wiki/Geometric_Shapes",
]

box_supplement = [
    "https://en.wikipedia.org/wiki/Geometric_Shapes_Extended",
]

# Category table: name -> (list of page URLs, one-line description).
# The dict keeps insertion order, so the rows below also fix the order in
# which the categories are iterated.
categories = {
    name: (pages, summary)
    for name, pages, summary in (
        ("symbols", symbols, "Some symbols"),
        ("enclosed", enclosed, "Enclosed numbers and letters"),
        ("enclosed_supplement", enclosed_supplement, "More enclosed numbers and letters"),
        ("arrows", arrows, "Arrows"),
        ("arrows_supplement", arrows_supplement, "More arrows"),
        ("dingbat", dingbat, "Dingbat symbols"),
        ("math", math, "Mathematical operators etc."),
        ("games", games, "Mahjong tiles, dominos and cards"),
        ("box", box, "Box drawing characters"),
        ("box_supplement", box_supplement, "More geometric shapes"),
        ("tech", tech, "Miscellaneous technical symbols"),
        ("phonetic", phonetic, "Representation of the sounds of spoken language"),
    )
}
# Scrape every category's Wikipedia pages and collect candidate characters.
#
# Result: ``chars[category] = (urls, valid, desc)``, where ``valid`` is the
# list of one-character cell texts found on the category's pages, in the
# order the pages and their ``<td>`` cells were visited.
chars = {}
session = requests.Session()  # one session shared by all page requests
for category, (urls, desc) in categories.items():
    valid = []
    for url in urls:
        response = session.get(url)
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently emitting an empty or truncated table.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for item in soup.find_all('td'):
            # Only cells with exactly one child node are candidates. Use the
            # ``contents`` list instead of ``children.__length_hint__()``:
            # the latter is a private protocol method that exists only when
            # ``children`` happens to be a list iterator.
            if len(item.contents) != 1:
                continue
            n = item.contents[0]
            # Exact type checks on purpose: subclasses of NavigableString
            # (e.g. comments) must not be treated as character text.
            if type(n) is bs4.element.NavigableString:
                text = item.get_text()
            elif type(n) is bs4.element.Tag and n.name == "a":
                text = n.get_text()
            else:
                continue
            # Keep the cell only if it carries a ``title`` attribute and its
            # text is exactly one character long.
            if len(text) == 1 and item.has_attr("title"):
                valid.append(text)
    chars[category] = (urls, valid, desc)
# Emit the crate attribute and the two ASCII tables as Rust source on stdout.
print("#![no_std]")
print("/// All ASCII characters except spacing")
# Inside a Rust char literal only the single quote and the backslash need a
# backslash escape; every other character here is written verbatim.
rust_escapes = {"'": "\\'", "\\": "\\\\"}
ascii_literals = "".join(
    "'" + rust_escapes.get(glyph, glyph) + "',"
    for glyph in """!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
)
print("pub const ASCII: &'static [char] = &[" + ascii_literals + "];")
print("/// ASCII spacing characters")
print("pub const ASCII_SPACE: &'static [char] = &[' ', '\\t', '\\n', '\\r'];")
# Emit one documented Rust constant per scraped category. Each ``chars``
# entry is a ``(page URLs, characters, description)`` tuple.
for name, (pages, glyphs, summary) in chars.items():
    # Doc comment: description, then the list of source pages.
    doc_lines = [
        "/// " + summary,
        "///",
        "/// Scraped from these wikipedia pages:",
        "///",
    ]
    doc_lines.extend("/// - " + page for page in pages)
    doc_lines.append("///")
    print("\n".join(doc_lines))
    # The constant itself: every character as a quoted literal followed by a
    # comma, all on a single line.
    literals = "".join("'" + glyph + "'," for glyph in glyphs)
    print("pub const " + name.upper() + " : &'static [char] = &[" + literals + "];")
# Emit a Rust `#[cfg(test)]` module after the generated constants. Each test
# in it compares the length of one generated constant against a hard-coded
# expected count, so a scrape that missed or gained characters fails
# `cargo test` in the generated crate. The text below is printed verbatim.
print("""
/// Assert that we got all characters from Wikipedia.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ascii() {
// letters + numbers + symbols
assert_eq!(ASCII.len(), 26 + 26 + 10 + 32);
assert_eq!(ASCII_SPACE.len(), 4);
}
#[test]
fn arrows() {
assert_eq!(ARROWS.len(), 112)
}
#[test]
fn arrows_supplement() {
assert_eq!(ARROWS_SUPPLEMENT.len(), 16 + 128 + 148)
}
#[test]
fn box_() { // box is keyword
assert_eq!(BOX.len(), 128 + 32 + 96)
}
#[test]
fn box_supplement() {
assert_eq!(BOX_SUPPLEMENT.len(), 85)
}
#[test]
fn dingbat() {
assert_eq!(DINGBAT.len(), 48 + 12 * 16)
}
#[test]
fn enclosed() {
assert_eq!(ENCLOSED.len(), 160)
}
#[test]
fn enclosed_supplement() {
// - regional indicators
assert_eq!(ENCLOSED_SUPPLEMENT.len(), 191 - 26 + 64)
}
#[test]
fn games() {
assert_eq!(GAMES.len(), 44 + 100 + 82)
}
#[test]
fn math() {
assert_eq!(MATH.len(), 256 + 256 + 48 + 128)
}
#[test]
fn symbols() {
// - spaces
assert_eq!(SYMBOLS.len(), 32 + 111 - 41 + 80 + 60 + 256)
}
#[test]
fn tech() {
assert_eq!(TECH.len(), 256)
}
#[test]
fn phonetic() {
assert_eq!(PHONETIC.len(), 96 + 80 + 128 + 64 + 32 + 42)
}
}
""")