# auto_correct 0.1.9

import urllib.request
import time
from pathlib import Path

# Remote lemma list that init_lemma() downloads.
URL = "http://www.kilgarriff.co.uk/BNClists/lemma.num"
# Letters tried at every position for the insert/replace edits.
ALPHABET = "abcdefghijklmnopqrstuvwxyz"

# process_n_cut() stops after this many lines when cut_level == 1 ...
WORD_LIMIT_HIGH = 144000
# ... and after this many lines when cut_level == 0.
WORD_LIMIT_LOW = 72000
# Divisor applied to each raw count by process_n_cut() to produce the score.
BASE_SCORE = 1270

def init_lemma(override=True, path="lemma.txt"):
    """Download the lemma list from ``URL`` and store it at *path*.

    Args:
        override: When true the file is always downloaded again; when false
            the download only happens if *path* is not an existing file.
        path: Where the downloaded list is saved.
    """
    already_present = Path(path).is_file()
    if already_present and not override:
        # A local copy exists and the caller did not ask to refresh it.
        return

    urllib.request.urlretrieve(URL, path)


def process_words(override=True, skip_score=False, path="lemma.txt", dest_path="words.txt"):
    """Convert the space-separated lemma list into a ``word[,count]`` file.

    Every source line is split on single spaces; the third field is used as
    the word and the second field as its count. Lines with fewer than three
    fields (for example blank lines) or with an empty word field are skipped.

    Args:
        override: When false, nothing is done if *dest_path* already exists.
        skip_score: When true only the word is written, otherwise
            ``word,count``.
        path: Source file to read.
        dest_path: File to write, one entry per line.

    Returns:
        None. An error message is printed if *path* is not a file.
    """
    dest_file = Path(dest_path)
    source_file = Path(path)

    if not source_file.is_file():
        print("Error: can't locate the source file")
        return

    if dest_file.is_file() and not override:
        return

    # Parse the whole source before touching the destination, so a failure
    # while reading never leaves a truncated output file behind.
    entries = []
    with open(path, 'r') as f:
        for line in f:
            # Strip the newline first so a line ending right after the word
            # does not carry "\n" into the word itself.
            word_info = line.rstrip('\n').split(' ')

            if len(word_info) < 3 or word_info[2] == "":
                continue

            if skip_score:
                word = word_info[2]
            else:
                word = word_info[2] + "," + word_info[1]

            entries.append(word + '\n')

    with open(dest_path, 'w') as df:
        df.writelines(entries)


def process_n_cut(override=True, cut_level=2, path="freq_50k.txt", dest_path="freq_50k.txt"):
    """Rescale the counts of a ``word,count`` file and optionally truncate it.

    Each count is divided by ``BASE_SCORE`` and truncated to an integer. The
    ``word,count`` header line and blank or comma-less lines are skipped.
    The number of processed lines is printed at the end.

    The source is read completely before the destination is opened, so
    *path* and *dest_path* may name the same file (as the defaults do).

    Args:
        override: When false, nothing is done if *dest_path* already exists.
        cut_level: ``0`` stops after ``WORD_LIMIT_LOW`` lines, ``1`` stops
            after ``WORD_LIMIT_HIGH`` lines, any other value processes the
            whole file.
        path: Source ``word,count`` file.
        dest_path: File to write the ``word,score`` lines to.

    Returns:
        None. An error message is printed if *path* is not a file.

    Raises:
        ValueError: If a count field is not an integer literal. The
            destination has not been written at that point.
    """
    dest_file = Path(dest_path)
    source_file = Path(path)

    if not source_file.is_file():
        print("Error: can't locate the source file")
        return

    if dest_file.is_file() and not override:
        return

    limit = {0: WORD_LIMIT_LOW, 1: WORD_LIMIT_HIGH}.get(cut_level)

    # Opening the destination with 'w' truncates it immediately; since the
    # default destination IS the source, read everything first.
    with open(path, 'r') as f:
        lines = f.readlines()

    count = 0
    output = []
    for line in lines:
        line = line.rstrip("\n")
        if line == "" or line == "word,count":
            continue

        word_info = line.split(',')
        if len(word_info) < 2:
            # No comma on this line: nothing to score.
            continue

        if word_info[0] != "" and word_info[1] != "":
            score = int(int(word_info[1]) / BASE_SCORE)
            output.append(word_info[0] + "," + str(score) + "\n")

        count += 1

        if limit is not None and count == limit:
            break

    with open(dest_path, 'w') as df:
        df.writelines(output)

    print(count)


def process_n_save_reverse(override=True, find_all=True, path="./en-us/freq_50k.txt", dest_path="./en-us/freq_50k_precalc_rev.txt"):
    """Write a reverse one-edit lookup table for the words in *path*.

    The words are taken from the first column of a ``word,count`` file. For
    every word, ``words_in_one_edit`` is asked for its neighbours, and each
    neighbour is mapped back to the words that produced it. The table is
    written as ``neighbour^word1;word2`` lines.

    The ``word,count`` header, blank lines, lines without a comma and lines
    with an empty word or count are ignored. The destination is opened only
    after all computation has finished.

    Args:
        override: When false, nothing is done if *dest_path* already exists.
        find_all: Forwarded unchanged to ``words_in_one_edit``.
        path: Source ``word,count`` file.
        dest_path: File receiving the reverse table.

    Returns:
        The set of words read from *path*, or None when the source is
        missing or the destination exists and *override* is false.
    """
    source_file = Path(path)
    dest_file = Path(dest_path)

    if not source_file.is_file():
        print("Error: can't locate the source file")
        return

    if dest_file.is_file() and not override:
        return

    print("Info: process begins...\n")

    words = set()
    with open(path, 'r') as f:
        for line in f:
            line = line.rstrip("\n")
            if line == "" or line == "word,count":
                continue

            word_info = line.split(',')
            if len(word_info) >= 2 and word_info[0] != "" and word_info[1] != "":
                words.add(word_info[0])

    ts = time.gmtime()
    print(time.strftime("%H:%M:%S", ts), len(words), "\n")

    count = 0
    rev_dict = {}
    for word in words:
        neighbors = words_in_one_edit(word, words, find_all)
        if not neighbors:
            continue

        for neighbor in neighbors:
            # Values stay plain ';'-joined strings: the table can hold a
            # very large number of keys, and a list per key would cost
            # noticeably more memory than sharing the word string.
            if neighbor in rev_dict:
                rev_dict[neighbor] = rev_dict[neighbor] + ";" + word
            else:
                rev_dict[neighbor] = word

        count += 1
        if count % 500 == 0:
            ts = time.gmtime()
            print(time.strftime("%H:%M:%S", ts), " - ", count, "\n")

    with open(dest_path, 'w') as df:
        for rev_word, linked in rev_dict.items():
            df.write(rev_word + "^" + linked + "\n")

    return words


def process_n_save(override=True, find_all=True, path="./en-us/freq_50k.txt", dest_path="./en-us/freq_50k_precalc.txt"):
    """Write a one-edit neighbour table for the words in *path*.

    The words are taken from the first column of a ``word,count`` file. For
    every word with at least one neighbour returned by
    ``words_in_one_edit``, a line ``word^n1;n2;...;`` is written (each
    neighbour is followed by ``;``).

    The ``word,count`` header, blank lines, lines without a comma and lines
    with an empty word or count are ignored. The source is parsed completely
    before the destination is opened.

    Args:
        override: When false, nothing is done if *dest_path* already exists.
        find_all: Forwarded unchanged to ``words_in_one_edit``.
        path: Source ``word,count`` file.
        dest_path: File receiving the neighbour table.

    Returns:
        The set of words read from *path*, or None when the source is
        missing or the destination exists and *override* is false.
    """
    source_file = Path(path)
    dest_file = Path(dest_path)

    if not source_file.is_file():
        print("Error: can't locate the source file")
        return

    if dest_file.is_file() and not override:
        return

    print("Info: process begins...\n")

    words = set()
    with open(path, 'r') as f:
        for line in f:
            line = line.rstrip("\n")
            if line == "" or line == "word,count":
                continue

            word_info = line.split(',')
            if len(word_info) >= 2 and word_info[0] != "" and word_info[1] != "":
                words.add(word_info[0])

    ts = time.gmtime()
    print(time.strftime("%H:%M:%S", ts), len(words), "\n")

    count = 0
    # Rows are streamed straight to disk rather than collected in memory.
    with open(dest_path, 'w') as df:
        for word in words:
            neighbors = words_in_one_edit(word, words, find_all)
            if not neighbors:
                continue

            joined = "".join(neighbor + ";" for neighbor in neighbors if neighbor)
            df.write(word + "^" + joined + "\n")

            count += 1
            if count % 500 == 0:
                ts = time.gmtime()
                print(time.strftime("%H:%M:%S", ts), " - ", count, "\n")

    return words


def words_in_one_edit(source, dict, find_all):
    """Return the set of strings one edit away from *source*.

    Four kinds of edit are generated at every position: deleting a
    character, swapping two adjacent characters, inserting a letter of
    ``ALPHABET`` and replacing a character with a letter of ``ALPHABET``.
    Deletions and swaps are only attempted when *source* is longer than one
    character. Each candidate is handed to ``check_and_add``, which decides
    whether it is kept.

    Args:
        source: The word to edit.
        dict: Collection of known words, passed on to ``check_and_add``.
        find_all: Passed on to ``check_and_add``.

    Returns:
        A set of candidate strings.
    """
    candidates = set()
    size = len(source)
    can_shrink = size > 1

    for pos in range(size + 1):
        head = source[:pos]
        tail = source[pos:]
        tail_after = source[pos + 1:]

        if can_shrink:
            # Deletion of the character at pos.
            check_and_add(head + tail_after, source, dict, candidates, find_all)

            # Transposition of the characters at pos and pos + 1.
            if pos + 2 <= size:
                swapped = head + source[pos + 1] + source[pos] + source[pos + 2:]
                check_and_add(swapped, source, dict, candidates, find_all)

        for letter in ALPHABET:
            # Insertion of letter before pos.
            check_and_add(head + letter + tail, source, dict, candidates, find_all)

            # Replacement of the character at pos with letter.
            check_and_add(head + letter + tail_after, source, dict, candidates, find_all)

    return candidates


def check_and_add(target, source, dict, result, find_all):
    """Add *target* to *result* if it qualifies as a new candidate.

    A candidate is rejected when it equals *source* or is already in
    *result*. Otherwise it is added when *find_all* is true or when it is
    contained in *dict*. *result* is modified in place.
    """
    if target == source or target in result:
        return

    if find_all or target in dict:
        result.add(target)


if __name__ == '__main__':
    # Standard words processing (disabled):
    #     init_lemma(False)
    #     process_words(dest_path="words2.txt")

    # Processing with different level of dictionary entry numbers (disabled):
    #     process_n_cut(cut_level=0, dest_path="uniq_low.txt")

    # Active step: write the one-edit neighbour table with default paths.
    # Uncomment the second call to produce the reverse table instead.
    process_n_save()
    # process_n_save_reverse()

    print("\nDone...\n")