import urllib.request
import time
from pathlib import Path
# Remote lemma list that init_lemma() downloads.
URL = "http://www.kilgarriff.co.uk/BNClists/lemma.num"
# Letters tried for insertions and replacements in words_in_one_edit().
ALPHABET = "abcdefghijklmnopqrstuvwxyz"
# Maximum number of words process_n_cut() writes when cut_level == 1.
WORD_LIMIT_HIGH = 144000
# Maximum number of words process_n_cut() writes when cut_level == 0.
WORD_LIMIT_LOW = 72000
# Divisor process_n_cut() applies to each raw count to obtain the stored score.
BASE_SCORE = 1270
def init_lemma(override=True, path="lemma.txt"):
    """Fetch the lemma list from URL and store it at *path*.

    The download is skipped only when *path* already exists as a regular
    file and *override* is false.

    Args:
        override: Download even if *path* already exists.
        path: Local file name the downloaded list is saved under.
    """
    already_present = Path(path).is_file()
    if already_present and not override:
        return
    urllib.request.urlretrieve(URL, path)
def process_words(override=True, skip_score=False, path="lemma.txt", dest_path="words.txt"):
    """Extract the word column (and optionally its score) from a lemma file.

    Every source line is split on single spaces. The third field is taken
    as the word and the second field as its score. One line is written to
    *dest_path* per usable source line: ``word`` when *skip_score* is true,
    otherwise ``word,score``.

    Lines with fewer than three fields (for example blank lines) or with an
    empty third field are skipped.

    Args:
        override: Rewrite *dest_path* even if it already exists.
        skip_score: Write only the word, without the ``,score`` suffix.
        path: Source file to read.
        dest_path: Destination file to write.

    Returns:
        None. Prints an error and returns early if *path* is not a file;
        returns early without writing if *dest_path* exists and *override*
        is false.
    """
    if not Path(path).is_file():
        print("Error: can't locate the source file")
        return
    if Path(dest_path).is_file() and not override:
        return

    # Read the whole source before opening the destination for writing, so
    # that using the same file for both does not truncate the input.
    with open(path, 'r') as f:
        lines = f.readlines()

    with open(dest_path, 'w') as df:
        for line in lines:
            # Drop the line terminator first; otherwise it would end up
            # inside the last field (and thus inside the written word).
            word_info = line.rstrip("\n").split(' ')
            if len(word_info) < 3 or word_info[2] == "":
                continue
            if skip_score:
                word = word_info[2]
            else:
                word = word_info[2] + "," + word_info[1]
            df.write(word + '\n')
def process_n_cut(override=True, cut_level=2, path="freq_50k.txt", dest_path="freq_50k.txt"):
    """Rescale the counts of a ``word,count`` file and optionally truncate it.

    Each usable line ``word,count`` is rewritten as ``word,score`` where
    ``score`` is ``count`` divided by BASE_SCORE, truncated to an integer.
    The ``word,count`` header line, lines without a comma, and lines with an
    empty word or count are skipped. The number of written words is printed
    at the end.

    Args:
        override: Rewrite *dest_path* even if it already exists.
        cut_level: 0 stops after WORD_LIMIT_LOW words, 1 stops after
            WORD_LIMIT_HIGH words, any other value writes every word.
        path: Source file to read.
        dest_path: Destination file to write. May be the same file as
            *path* (the default), in which case it is rewritten in place.

    Returns:
        None. Prints an error and returns early if *path* is not a file;
        returns early without writing if *dest_path* exists and *override*
        is false.
    """
    if not Path(path).is_file():
        print("Error: can't locate the source file")
        return
    if Path(dest_path).is_file() and not override:
        return

    # The source MUST be fully read before the destination is opened: by
    # default both are the same file, and opening it in 'w' mode truncates
    # it immediately, which would leave nothing to read.
    with open(path, 'r') as f:
        lines = f.readlines()

    if cut_level == 0:
        limit = WORD_LIMIT_LOW
    elif cut_level == 1:
        limit = WORD_LIMIT_HIGH
    else:
        limit = None  # no cut

    count = 0
    with open(dest_path, 'w') as df:
        for line in lines:
            word_info = line.rstrip("\n").split(',')
            if word_info == ["word", "count"]:
                continue  # header line
            # Skip blank/malformed lines instead of failing on a missing field.
            if len(word_info) < 2 or word_info[0] == "" or word_info[1] == "":
                continue
            score = int(int(word_info[1]) / BASE_SCORE)
            df.write(word_info[0] + "," + str(score) + "\n")
            count += 1
            if limit is not None and count == limit:
                break
    print(count)
def process_n_save_reverse(override=True, find_all=True, path="./en-us/freq_50k.txt", dest_path="./en-us/freq_50k_precalc_rev.txt"):
    """Precompute a reverse one-edit index and save it to *dest_path*.

    The distinct words of the ``word,count`` source file are collected, then
    words_in_one_edit() is evaluated for each of them. For every resulting
    neighbor the words that produced it are recorded, and one line
    ``neighbor^word1;word2;...`` is written per neighbor. Progress is
    printed with a UTC timestamp every 500 processed words.

    The ``word,count`` header line, lines without a comma, and lines with an
    empty word or count are skipped.

    Args:
        override: Rewrite *dest_path* even if it already exists.
        find_all: Passed through to words_in_one_edit().
        path: Source file to read.
        dest_path: Destination file to write.

    Returns:
        The set of words read from *path*, or None when *path* is not a
        file or when *dest_path* exists and *override* is false.
    """
    if not Path(path).is_file():
        print("Error: can't locate the source file")
        return
    if Path(dest_path).is_file() and not override:
        return

    print("Info: process begins...\n")
    words = set()
    with open(path, 'r') as f:
        for line in f:
            word_info = line.rstrip("\n").split(',')
            if word_info == ["word", "count"]:
                continue  # header line
            # Require both fields so a line without a comma is skipped
            # rather than raising IndexError.
            if len(word_info) >= 2 and word_info[0] != "" and word_info[1] != "":
                words.add(word_info[0])
    print(time.strftime("%H:%M:%S", time.gmtime()), len(words), "\n")

    # neighbor -> list of source words that are one edit away from it.
    rev_dict = {}
    count = 0
    for word in words:
        neighbors = words_in_one_edit(word, words, find_all)
        if not neighbors:
            continue
        for neighbor in neighbors:
            rev_dict.setdefault(neighbor, []).append(word)
        count += 1
        if count % 500 == 0:
            print(time.strftime("%H:%M:%S", time.gmtime()), " - ", count, "\n")

    # Open the destination only once the index is complete, so a failure
    # during the computation does not leave a truncated file behind.
    with open(dest_path, 'w') as df:
        for rev_word, linked in rev_dict.items():
            df.write(rev_word + "^" + ";".join(linked) + "\n")
    return words
def process_n_save(override=True, find_all=True, path="./en-us/freq_50k.txt", dest_path="./en-us/freq_50k_precalc.txt"):
    """Precompute the one-edit neighbors of every word and save them.

    The distinct words of the ``word,count`` source file are collected, then
    words_in_one_edit() is evaluated for each of them. For every word with
    at least one neighbor, a line ``word^n1;n2;...;`` is written (each
    neighbor is followed by ``;``, including the last one). Progress is
    printed with a UTC timestamp every 500 written words.

    The ``word,count`` header line, lines without a comma, and lines with an
    empty word or count are skipped.

    Args:
        override: Rewrite *dest_path* even if it already exists.
        find_all: Passed through to words_in_one_edit().
        path: Source file to read.
        dest_path: Destination file to write.

    Returns:
        The set of words read from *path*, or None when *path* is not a
        file or when *dest_path* exists and *override* is false.
    """
    if not Path(path).is_file():
        print("Error: can't locate the source file")
        return
    if Path(dest_path).is_file() and not override:
        return

    print("Info: process begins...\n")
    words = set()
    # Finish reading the source before the destination is opened for
    # writing, so the input can never be truncated by the output.
    with open(path, 'r') as f:
        for line in f:
            word_info = line.rstrip("\n").split(',')
            if word_info == ["word", "count"]:
                continue  # header line
            # Require both fields so a line without a comma is skipped
            # rather than raising IndexError.
            if len(word_info) >= 2 and word_info[0] != "" and word_info[1] != "":
                words.add(word_info[0])
    print(time.strftime("%H:%M:%S", time.gmtime()), len(words), "\n")

    count = 0
    with open(dest_path, 'w') as df:
        for word in words:
            neighbors = words_in_one_edit(word, words, find_all)
            if not neighbors:
                continue
            joined = "".join(neighbor + ";" for neighbor in neighbors if neighbor)
            df.write(word + "^" + joined + "\n")
            count += 1
            if count % 500 == 0:
                print(time.strftime("%H:%M:%S", time.gmtime()), " - ", count, "\n")
    return words
def words_in_one_edit(source, dict, find_all):
    """Collect the strings reachable from *source* with a single edit.

    For every position (including the one past the end) the following
    candidates are offered to check_and_add():
      * deletion of the character at that position (only when *source* has
        more than one character),
      * transposition of the character with its right-hand neighbor,
      * insertion of each ALPHABET letter at that position,
      * replacement of the character by each ALPHABET letter (at the end
        position this yields the same string as the insertion).

    Args:
        source: The word to edit.
        dict: Collection of known words, forwarded to check_and_add().
        find_all: Forwarded to check_and_add().

    Returns:
        The set of candidates accepted by check_and_add().
    """
    found = set()
    size = len(source)
    for pos in range(size + 1):
        head, tail = source[:pos], source[pos:]
        candidates = []
        if size > 1:
            candidates.append(head + tail[1:])
        if pos + 2 <= size:
            candidates.append(head + tail[1] + tail[0] + tail[2:])
        for letter in ALPHABET:
            candidates.append(head + letter + tail)
            candidates.append(head + letter + tail[1:])
        for candidate in candidates:
            check_and_add(candidate, source, dict, found, find_all)
    return found
def check_and_add(target, source, dict, result, find_all):
    """Add *target* to *result* if it is an acceptable edit of *source*.

    *target* is rejected when it equals *source* or is already in *result*.
    Otherwise it is added when *find_all* is true or when it is a member of
    *dict*.

    Args:
        target: Candidate string.
        source: The word the candidate was derived from.
        dict: Collection of known words used for the membership test.
        result: Set that accepted candidates are added to (mutated in place).
        find_all: Accept candidates regardless of membership in *dict*.
    """
    if target == source or target in result:
        return
    if not find_all and target not in dict:
        return
    result.add(target)
# Script entry point: run process_n_save() with its default arguments and
# print a completion message afterwards.
if __name__ == '__main__':
    process_n_save()
    print("\nDone...\n")