unichars 0.0.2

Constant, categorized collections of characters
Documentation
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import bs4

"""
Languages are messy and I am too lazy to support them :p
lang = [#"https://en.wikipedia.org/wiki/NKo_(Unicode_block)",
        #"https://en.wikipedia.org/wiki/Samaritan_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Syriac_(Unicode_block)",
        #"https://en.wikipedia.org/wiki/Thaana_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Tifinagh_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Bengali_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)"
        ]

lang_bali = ["https://en.wikipedia.org/wiki/Balinese_(Unicode_block)"]
lang_bugi = ["https://en.wikipedia.org/wiki/Buginese_(Unicode_block)"]
lang_cher = ["https://en.wikipedia.org/wiki/Cherokee_(Unicode_block)"]
# Missing "Greek and Coptic" block
lang_copt = ["https://en.wikipedia.org/wiki/Coptic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Coptic_Epact_Numbers"]
# 256, missing a lot
lang_cyrl = ["https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Cyrillic_Supplement"]
lang_geor = ["https://en.wikipedia.org/wiki/Georgian_(Unicode_block)"]
# 94 + 38
lang_glag = ["https://en.wikipedia.org/wiki/Glagolitic_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Glagolitic_Supplement"]
# Missing "Greek and Coptic" block
# also missing "Combining and letter-free diacritics"
lang_grek = ["https://en.wikipedia.org/wiki/Greek_Extended"]
lang_mand = ["https://en.wikipedia.org/wiki/Mandaic_(Unicode_block)"]
"""

# Wikipedia pages to scrape, one list of URLs per generated constant.

# Currency, punctuation, letterlike, number-form and miscellaneous symbols.
symbols = [
    "https://en.wikipedia.org/wiki/Currency_Symbols_(Unicode_block)",
    "https://en.wikipedia.org/wiki/General_Punctuation",
    "https://en.wikipedia.org/wiki/Letterlike_Symbols",
    "https://en.wikipedia.org/wiki/Number_Forms",
    "https://en.wikipedia.org/wiki/Miscellaneous_Symbols",
]

# IPA, modifier letters and super/subscripts.
phonetic = [
    "https://en.wikipedia.org/wiki/IPA_Extensions",
    "https://en.wikipedia.org/wiki/Spacing_Modifier_Letters",
    "https://en.wikipedia.org/wiki/Phonetic_Extensions",
    "https://en.wikipedia.org/wiki/Phonetic_Extensions_Supplement",
    "https://en.wikipedia.org/wiki/Modifier_Tone_Letters",
    "https://en.wikipedia.org/wiki/Superscripts_and_Subscripts",
]

# Enclosed alphanumerics and their supplements.
enclosed = [
    "https://en.wikipedia.org/wiki/Enclosed_alphanumerics",
]
enclosed_supplement = [
    "https://en.wikipedia.org/wiki/Enclosed_Alphanumeric_Supplement",
    "https://en.wikipedia.org/wiki/Enclosed_Ideographic_Supplement",
]

# Arrows and the supplemental arrow blocks.
arrows = [
    "https://en.wikipedia.org/wiki/Arrows_(Unicode_block)",
]
arrows_supplement = [
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-A",
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-B",
    "https://en.wikipedia.org/wiki/Supplemental_Arrows-C",
]

dingbat = [
    "https://en.wikipedia.org/wiki/Dingbat",
]

# Mathematical operators and symbols.
math = [
    "https://en.wikipedia.org/wiki/Mathematical_Operators",
    "https://en.wikipedia.org/wiki/Supplemental_Mathematical_Operators",
    "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-A",
    "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-B",
]

tech = [
    "https://en.wikipedia.org/wiki/Miscellaneous_Technical",
]

# Mahjong tiles, dominoes and playing cards.
games = [
    "https://en.wikipedia.org/wiki/Mahjong_Tiles_(Unicode_block)",
    "https://en.wikipedia.org/wiki/Domino_Tiles",
    "https://en.wikipedia.org/wiki/Unicode_Playing_Card_Block",
]

# Box drawing, block elements and geometric shapes.
box = [
    "https://en.wikipedia.org/wiki/Box_Drawing",
    "https://en.wikipedia.org/wiki/Block_Elements",
    "https://en.wikipedia.org/wiki/Geometric_Shapes",
]
box_supplement = [
    "https://en.wikipedia.org/wiki/Geometric_Shapes_Extended",
]

# Category name -> (URLs to scrape, description used as the Rust doc comment).
# The upper-cased key becomes the name of the generated constant, and the
# insertion order here is the order in which the constants are emitted.
categories = {
    "symbols": (symbols, "Some symbols"),
    "enclosed": (enclosed, "Enclosed numbers and letters"),
    "enclosed_supplement": (
        enclosed_supplement,
        "More enclosed numbers and letters",
    ),
    "arrows": (arrows, "Arrows"),
    "arrows_supplement": (arrows_supplement, "More arrows"),
    "dingbat": (dingbat, "Dingbat symbols"),
    "math": (math, "Mathematical operators etc."),
    "games": (games, "Mahjong tiles, dominos and cards"),
    "box": (box, "Box drawing characters"),
    "box_supplement": (box_supplement, "More geometric shapes"),
    "tech": (tech, "Miscellaneous technical symbols"),
    "phonetic": (
        phonetic,
        "Representation of the sounds of spoken language",
    ),
}

# Scrape every category's Wikipedia pages and collect the single-character
# table cells found on them.
# Maps category name -> (source URLs, scraped characters, description).
chars = {}
session = requests.Session()  # reuse one connection across all requests
for category, (urls, desc) in categories.items():
    valid = []
    for url in urls:
        response = session.get(url, timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page and
        # silently emitting an empty or truncated constant.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        for item in soup.find_all('td'):
            # Only cells with exactly one child node are candidates.
            # `contents` is the documented child list; asking the `children`
            # iterator for `__length_hint__()` relied on its concrete type.
            if len(item.contents) != 1:
                continue
            n = item.contents[0]

            # Accept either bare text or text wrapped in a single link.
            # Exact type checks are intentional: subclasses of
            # NavigableString (comments etc.) must not match.
            text = ""
            if type(n) is bs4.element.NavigableString:
                text = item.get_text()
            elif type(n) is bs4.element.Tag and n.name == "a":
                text = n.get_text()

            # Keep a cell only when it holds exactly one character and
            # carries a `title` attribute; everything else is skipped.
            if len(text) == 1 and item.has_attr("title"):
                valid.append(text)
    chars[category] = (urls, valid, desc)


def _rust_char_literal(char):
    """Return *char* rendered as a Rust ``char`` literal.

    A single quote and a backslash are backslash-escaped, since either one
    written verbatim would break the literal; every other character is
    emitted as-is between single quotes.
    """
    if char in ("'", "\\"):
        return "'\\" + char + "'"
    return "'" + char + "'"


# Emit the Rust source: crate attribute, the two hand-written ASCII
# constants, then one constant per scraped category.
print("#![no_std]")
print("/// All ASCII characters except spacing")
print("pub const ASCII: &'static [char] = &[", end="")
for char in """!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~""":
    print(_rust_char_literal(char), ",", sep="", end="")
print("];")
print("/// ASCII spacing characters")
print("pub const ASCII_SPACE: &'static [char] = &[' ', '\\t', '\\n', '\\r'];")
for category, (urls, valid, desc) in chars.items():
    # Doc comment: the description followed by the list of source pages.
    print("///", desc)
    print("///")
    print("/// Scraped from these wikipedia pages:")
    print("///")
    for url in urls:
        print("/// -", url)
        print("///")
    print("pub const", category.upper(), ": &'static [char] = &[", end="")
    for char in valid:
        # Escape through the same helper as the ASCII table so a scraped
        # quote or backslash cannot produce invalid Rust.
        print(_rust_char_literal(char), ",", sep='', end='')
    print("];")

# Emit the Rust `tests` module that closes the generated file. Each test
# asserts that one generated constant has a hard-coded expected length, so a
# change in what the scraper collects shows up as a failing Rust test.
# The literal starts with a newline, so a blank line separates the module
# from the last constant.
print("""
/// Assert that we got all characters from Wikipedia.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ascii() {
        // letters + numbers + symbols
        assert_eq!(ASCII.len(), 26 + 26 + 10 + 32);
        assert_eq!(ASCII_SPACE.len(), 4);
    }

    #[test]
    fn arrows() {
        assert_eq!(ARROWS.len(), 112)
    }

    #[test]
    fn arrows_supplement() {
        assert_eq!(ARROWS_SUPPLEMENT.len(), 16 + 128 + 148)
    }

    #[test]
    fn box_() { // box is keyword
        assert_eq!(BOX.len(), 128 + 32 + 96)
    }

    #[test]
    fn box_supplement() {
        assert_eq!(BOX_SUPPLEMENT.len(), 85)
    }

    #[test]
    fn dingbat() {
        assert_eq!(DINGBAT.len(), 48 + 12 * 16)
    }

    #[test]
    fn enclosed() {
        assert_eq!(ENCLOSED.len(), 160)
    }

    #[test]
    fn enclosed_supplement() {
        //                                        - regional indicators
        assert_eq!(ENCLOSED_SUPPLEMENT.len(), 191 - 26 + 64)
    }

    #[test]
    fn games() {
        assert_eq!(GAMES.len(), 44 + 100 + 82)
    }

    #[test]
    fn math() {
        assert_eq!(MATH.len(), 256 + 256 + 48 + 128)
    }

    #[test]
    fn symbols() {
        //                                 - spaces
        assert_eq!(SYMBOLS.len(), 32 + 111 - 41 + 80 + 60 + 256)
    }

    #[test]
    fn tech() {
        assert_eq!(TECH.len(), 256)
    }

    #[test]
    fn phonetic() {
        assert_eq!(PHONETIC.len(), 96 + 80 + 128 + 64 + 32 + 42)
    }
}
""")