from googletrans import Translator
# Module-wide googletrans client, configured with translate.google.us as its
# only service URL. translate_words() uses it for every translation request.
GoogleTranslate = Translator(service_urls=["translate.google.us"])
def translate_char(ch:str, ignore_ch: bool, en_text: str, mathplayer: dict, sre: dict, google: dict):
    """Return the candidate translations for one character's English text.

    Args:
        ch: The character being defined; used as the key into ``mathplayer``
            and ``sre``.
        ignore_ch: When true, the per-character MathPlayer/SRE entries are
            discarded and only the Google lookup is used.
        en_text: The English text to translate.
        mathplayer: Mapping of character -> MathPlayer translation.
        sre: Mapping of character -> SRE translation.
        google: Mapping of (normalized) English text -> Google translation.

    Returns:
        A ``(mp_trans, sre_trans, google_trans)`` tuple. Missing entries are
        ``''``. ``google_trans`` stays ``''`` when MathPlayer and SRE agree on
        a non-empty translation and ``ignore_ch`` is false.

    Raises:
        KeyError: If a Google lookup is needed and the normalized text is not
            a key of ``google``.
    """
    mp_trans = mathplayer.get(ch, '')
    sre_trans = sre.get(ch, '')

    # Both existing sources agree on something non-empty: no Google lookup.
    if not ignore_ch and mp_trans != '' and mp_trans == sre_trans:
        return (mp_trans, sre_trans, '')

    if ignore_ch:
        mp_trans = sre_trans = ''

    # Expand the file's English shorthand before using it as the lookup key.
    normalized = en_text.replace("eigh", "a")
    normalized = normalized.replace("Eigh", "A")
    normalized = normalized.replace("cap", "uppercase")
    normalized = normalized.replace("paren", "parenthesis")

    # Text of a single character is not looked up; the character itself is used.
    google_trans = google[normalized] if len(normalized) > 1 else ch
    return (mp_trans, sre_trans, google_trans)
import re
# Matches a `t: "..."` entry and captures the non-empty quoted text.
TextToTranslate = re.compile('t: "([^"]+)"')


def translate_char_line(ch: str, line:str, mathplayer: dict, sre: dict, google: dict):
    """Translate every `t: "..."` entry on one line of a character definition.

    Lines whose first non-blank character is '#' are returned unchanged.

    Args:
        ch: The character the surrounding definition belongs to.
        line: The line of text to process.
        mathplayer: Mapping of character -> MathPlayer translation.
        sre: Mapping of character -> SRE translation.
        google: Mapping of English text -> Google translation.

    Returns:
        A ``(new_line, details)`` tuple. ``details`` is empty when nothing was
        replaced; otherwise it holds 'original', 'translation' and
        'alternatives' for the last entry replaced on the line.
    """
    details = {}
    # A line containing 'then:' does not use the per-character translations.
    ignore_ch = line.find('then:') >= 0

    def choose(mp_text, sre_text, google_text):
        """Pick the translation to use and list the ones that lost out."""
        mp_note = "MathPlayer: '{}'".format(mp_text)
        sre_note = "SRE: '{}'".format(sre_text)
        google_note = "google: '{}'".format(google_text)

        if mp_text and mp_text == sre_text:
            return mp_text, []
        if google_text == mp_text:
            differs = sre_text and sre_text != mp_text
            return mp_text, ([sre_note] if differs else [])
        if google_text == sre_text:
            return sre_text, ([mp_note] if mp_text else [])
        if sre_text:
            notes = [mp_note] if mp_text else []
            notes.append(google_note)
            return sre_text, notes
        if mp_text:
            # sre_text is necessarily empty here, so only google is listed.
            return mp_text, [google_note]
        return google_text, []

    def substitute(found):
        english = found.group(1)
        mp_text, sre_text, google_text = translate_char(
            ch, ignore_ch, english, mathplayer, sre, google)
        chosen, notes = choose(mp_text, sre_text, google_text)
        details['original'] = english
        details['translation'] = chosen
        details['alternatives'] = notes
        return 't: "{}"'.format(chosen)

    if line.lstrip().startswith('#'):
        return (line, details)
    return (TextToTranslate.sub(substitute, line), details)
# Matches the start of a character definition: a '- ' list marker followed by
# a single quoted character, which is captured.
CharDefStart = re.compile('[^-]*- "([^"])"')


def get_next_char_def(lines: list):
    """Return the leading lines of ``lines`` that make up one definition.

    The first line is always included. The result stops just before the next
    line (index 1 or later) that matches ``CharDefStart``. If no such line
    exists, ``lines`` itself is returned.
    """
    for position in range(1, len(lines)):
        if CharDefStart.match(lines[position]):
            return lines[:position]
    return lines
def gather_words_in_char_def(lines: list, lang: str, mathplayer: dict, sre: dict, words_to_translate: set):
    """Collect the English texts of one character definition that need Google.

    A text is collected when MathPlayer and SRE do not agree on a non-empty
    translation for the character, or when the line contains 'then:' (such
    lines ignore the per-character translations). This mirrors the condition
    under which translate_char() looks a text up in the Google dictionary.

    Args:
        lines: The lines of one definition; the first is checked for the
            character being defined.
        lang: Target language code (not used by the gathering itself).
        mathplayer: Mapping of character -> MathPlayer translation.
        sre: Mapping of character -> SRE translation.
        words_to_translate: Set that collected texts are added to (mutated).

    Returns:
        The same ``words_to_translate`` set.
    """
    def gather_words_for_text(ch: str, en_text:str, lang: str, mathplayer: dict, sre: dict, words_to_translate: set):
        mp_trans = mathplayer.get(ch, '')
        sre_trans = sre.get(ch, '')
        if mp_trans != sre_trans or mp_trans == '':
            # Same shorthand expansion translate_char() applies before lookup.
            en_text = en_text.replace("eigh", "a").replace("Eigh", "A").replace("cap", "uppercase").replace("paren", "parenthesis")
            if len(en_text) > 1:
                words_to_translate.add(en_text)

    if not lines:
        return words_to_translate

    ch_match = CharDefStart.match(lines[0])
    ch = ch_match.group(1) if ch_match else ''
    for line in lines:
        text_match = TextToTranslate.search(line)
        if text_match:
            # str.find() returns -1 (truthy) when absent, so test membership
            # instead of the truthiness of find().
            ch_for_line = '' if 'then:' in line else ch
            gather_words_for_text(ch_for_line, text_match.group(1), lang, mathplayer, sre, words_to_translate)
    return words_to_translate
def process_char_def(lines: list, mathplayer: dict, sre: dict, google: dict, out_stream):
    """Translate one character definition and write it to ``out_stream``.

    Each line is passed through translate_char_line(). Any existing trailing
    comment is kept, and the rejected alternatives (plus the English original
    when it differs from the chosen translation) are appended to it in
    parentheses. If the first line is not a character definition, every line
    is written unchanged.

    Args:
        lines: The lines of one definition (non-empty).
        mathplayer: Mapping of character -> MathPlayer translation.
        sre: Mapping of character -> SRE translation.
        google: Mapping of English text -> Google translation.
        out_stream: Writable text stream that receives the output lines.
    """
    match = CharDefStart.match(lines[0])
    ch = match.group(1) if match else ''
    for line in lines:
        translated_line, details = translate_char_line(ch, line, mathplayer, sre, google)
        if translated_line:
            # Locate an existing trailing comment, skipping over a quoted "#".
            i_comment_char = translated_line.find('#')
            if i_comment_char > 0 and translated_line.find('"#"') >= 0:
                i_comment_char = translated_line.find('#', i_comment_char + 1)

            comment = ''
            has_trailing_comment = (
                i_comment_char > 0 and not translated_line.lstrip().startswith('#')
            )
            if has_trailing_comment:
                comment = translated_line[i_comment_char + 1:].rstrip()
                # Keep everything before '#'; trailing blanks are stripped below.
                translated_line = translated_line[:i_comment_char]

            if 'alternatives' in details:
                alternatives = details['alternatives']
                if details['original'] != details['translation']:
                    alternatives.insert(0, "en: '{}'".format(details['original']))
                if alternatives:
                    comment += '\t(' + ", ".join(alternatives) + ')'

            if comment:
                translated_line = "{:<48s}\t# {}\n".format(translated_line.rstrip(), comment)
            elif has_trailing_comment:
                # The comment was empty and has been cut off; restore the line
                # ending so the next line does not get appended to this one.
                translated_line = translated_line.rstrip() + '\n'
        out_stream.write(translated_line if ch else line)
def collect_words_to_translate(file_to_translate: str, lang: str, mathplayer: dict, sre: dict):
    """Read a definition file and return the set of English texts to translate.

    The file is split into character definitions with get_next_char_def() and
    each one is handed to gather_words_in_char_def(), which fills the set.
    """
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        all_lines = in_stream.readlines()

    collected = set()
    position = 0
    while position < len(all_lines):
        definition = get_next_char_def(all_lines[position:])
        if not definition:
            break
        gather_words_in_char_def(definition, lang, mathplayer, sre, collected)
        position += len(definition)
    return collected
import time

# Upper bound on the number of characters sent to Google in one request.
MAX_CHARS_IN_CHUNK = 4500


def translate_words(words_to_translate, lang):
    """Translate a collection of English texts with Google, in chunks.

    The texts are de-duplicated, joined with '.' + newline and sent in
    requests of roughly MAX_CHARS_IN_CHUNK characters, pausing two seconds
    between requests. Nothing is sent when there is nothing to translate.

    Args:
        words_to_translate: Iterable of English texts.
        lang: Destination language code passed to the translator.

    Returns:
        Dict mapping each English text to its lower-cased translation. If a
        response comes back with a different number of entries than were
        sent, a warning is printed and the entries are paired positionally
        up to the shorter length.
    """
    separator = ".\n"
    translations = {}

    def do_translation_chunk(words: list):
        if not words:
            # Nothing pending (empty input, or the last word filled a chunk).
            return
        word_string = separator.join(words)
        translated_words = GoogleTranslate.translate(word_string, src='en', dest=lang).text.lower()
        translated_words = translated_words.split(separator)
        if len(translated_words) != len(words):
            print("\n!!!Problem in translation: size of translations ({}) differs from words to translate ({})\n".format(len(translated_words), len(words)))
        for (orig, translation) in zip(words, translated_words):
            translations[orig] = translation

    chunk = []
    char_count = 0
    for word in set(words_to_translate):
        chunk.append(word)
        # Count the separator too, since it is part of the request text.
        char_count += len(word) + len(separator)
        if char_count >= MAX_CHARS_IN_CHUNK:
            do_translation_chunk(chunk)
            print("Translated {} words...".format(len(chunk)))
            char_count = 0
            chunk = []
            time.sleep(2)
    do_translation_chunk(chunk)
    return translations
def create_new_file(file_to_translate: str, output_file: str, mathplayer: dict, sre: dict, google: dict):
    """Write a translated copy of ``file_to_translate`` to ``output_file``.

    The input is split into character definitions with get_next_char_def()
    and each one is written out through process_char_def().
    """
    with open(file_to_translate, 'r', encoding='utf8') as in_stream, \
            open(output_file, 'w', encoding='utf8') as out_stream:
        source_lines = in_stream.readlines()
        cursor = 0
        while cursor < len(source_lines):
            definition = get_next_char_def(source_lines[cursor:])
            if not definition:
                break
            process_char_def(definition, mathplayer, sre, google, out_stream)
            cursor += len(definition)
def build_new_translation(path_to_mathcat: str, lang: str, unicode_file_name: str):
    """Build '<unicode_file_name>-<lang>.yaml' from existing translations.

    Loads the SRE and MathPlayer dictionaries for ``lang``, collects the
    texts that still need translating, translates them with Google, and
    writes the combined result to the current directory.

    Args:
        path_to_mathcat: Directory containing 'Rules/Languages'.
        lang: Target language code.
        unicode_file_name: Base name (without '.yaml') of the file to translate.
    """
    sre = get_sre_unicode_dict(SRE_Location, lang)
    mathplayer = get_mathplayer_unicode_dict(MP_Location, lang)

    # 'vi' and 'id' are read from their own language directory; all other
    # languages start from the English file.
    source_lang = lang if lang in ('vi', 'id') else 'en'
    file_to_translate = "{}/Rules/Languages/{}/{}.yaml".format(
        path_to_mathcat, source_lang, unicode_file_name)

    pending = collect_words_to_translate(file_to_translate, lang, mathplayer, sre)
    google = translate_words(pending, lang)
    print("Translations: MathPlayer={}, SRE={}, Google={}".format(len(mathplayer), len(sre), len(google)))

    output_file = "{}-{}.yaml".format(unicode_file_name, lang)
    create_new_file(file_to_translate, output_file, mathplayer, sre, google)
import os
import json
def get_sre_unicode_dict(path:str, lang: str):
    """Load the SRE character translations for ``lang``.

    Every file in ``<path>/<lang>/symbols`` is parsed as JSON. For each entry
    that has a "key" and a "default" mapping, the hexadecimal key is turned
    into a character and mapped to ``entry["mappings"]["default"]["default"]``.

    Args:
        path: Root directory of the SRE data.
        lang: Language code naming the subdirectory to read.

    Returns:
        Dict of character -> translation. Loading is best effort: if the
        directory is missing or any file cannot be read or does not have the
        expected structure, an empty dict is returned.
    """
    symbols_dir = os.path.join(path, lang, "symbols")
    translations = {}
    try:
        for filename in os.listdir(symbols_dir):
            with open(os.path.join(symbols_dir, filename), 'r', encoding='utf8') as in_stream:
                sre_data = json.load(in_stream)
            for sre_entry in sre_data:
                if "key" in sre_entry and "default" in sre_entry["mappings"]:
                    key = chr(int(sre_entry["key"], base=16))
                    translations[key] = sre_entry["mappings"]["default"]["default"]
    except Exception:
        # Deliberately broad so any data problem means "no SRE translations",
        # but unlike a bare except it lets KeyboardInterrupt/SystemExit through.
        return {}
    return translations
# Matches a line containing `(unicode == 0xHHHH)` (4 or 5 hex digits) followed
# by a quoted, non-empty string; captures the hex digits and the string.
MP_Pattern = re.compile(r'.*?\(unicode == 0x([0-9A-Fa-f]{4,5})\).*?"([^"]+)".*?')


def get_mathplayer_unicode_dict(path: str, lang: str):
    """Load the MathPlayer character translations for ``lang``.

    Reads ``<path>/<lang>/unicode.tdl`` and, for every line matching
    ``MP_Pattern``, maps the character for the hexadecimal code point to the
    stripped quoted text. Code points in 0xE000-0xF8FF and entries whose text
    is blank are skipped.

    Args:
        path: Root directory of the MathPlayer rule files.
        lang: Language code naming the subdirectory to read.

    Returns:
        Dict of character -> translation; an empty dict if the file is
        missing or cannot be read or decoded.
    """
    tdl_path = os.path.join(path, lang, "unicode.tdl")
    translations = {}
    try:
        with open(tdl_path, 'r', encoding='utf8') as in_stream:
            for line in in_stream:
                matches = MP_Pattern.match(line)
                if not matches:
                    continue
                int_key = int(matches.group(1), base=16)
                text = matches.group(2).strip()
                if (int_key < 0xE000 or int_key > 0xF8FF) and text:
                    translations[chr(int_key)] = text
    except (OSError, ValueError):
        # Missing/unreadable file or invalid UTF-8: treat as "no translations".
        return {}
    return translations
def dict_compare(lang: str, sre: dict, mp: dict):
    """Compare the SRE and MathPlayer dictionaries and write a report.

    The report goes to 'diffs-<lang>.txt' in the current directory: a summary
    line, then the characters whose translations differ, then the characters
    present only in SRE, then those present only in MathPlayer.

    Returns:
        ``(sre_only, mp_only, differ, same)`` where ``sre_only``, ``mp_only``
        and ``same`` are sets of characters and ``differ`` maps a character
        to its ``(sre, mp)`` pair of translations.
    """
    sre_keys = set(sre)
    mp_keys = set(mp)
    shared_keys = sre_keys & mp_keys
    sre_only = sre_keys - mp_keys
    mp_only = mp_keys - sre_keys

    differ = {}
    same = set()
    for char in shared_keys:
        if sre[char] != mp[char]:
            differ[char] = (sre[char], mp[char])
        if sre[char] == mp[char]:
            same.add(char)

    with open("diffs-{}.txt".format(lang), 'w', encoding='utf8') as out_stream:
        out_stream.write("sre/mp #chars={}/{}, #same={}, #differ={}, #only sre/mp={}/{}"
                         .format(len(sre), len(mp), len(same), len(differ), len(sre_only), len(mp_only) ))

        out_stream.write("\n\n---{}---\n".format("differ"))
        for char, pair in differ.items():
            out_stream.write(" {}({:0>4x})={}\n".format(char, ord(char), pair))

        for title, chars, source in (("sre_only", sre_only, sre), ("mp_only", mp_only, mp)):
            out_stream.write("\n---{}---\n".format(title))
            for char in chars:
                out_stream.write(" {}({:0>4x})='{}'\n".format(char, ord(char), source[char]))

    return (sre_only, mp_only, differ, same)
import sys
# Switch stdout to UTF-8 (presumably so the progress messages can be printed
# on consoles whose default encoding is not UTF-8).
sys.stdout.reconfigure(encoding='utf-8')
# Root directories of the SRE and MathPlayer data; build_new_translation()
# reads these module-level names. They are absolute paths on a Windows drive.
SRE_Location = r"C:\Dev\speech-rule-engine\mathmaps"
MP_Location = r"C:\Dev\mathplayer\EqnLib\rules\pvt"
# Executed whenever this module is run or imported: builds the Vietnamese
# ('vi') version of 'unicode-full', looking for MathCAT in the parent directory.
build_new_translation("..", "vi", "unicode-full")