#include "debug.h"
#include <stdio.h>
#include <string>
#include "cldutil.h"
#include "getonescriptspan.h"
#include "lang_script.h"
using namespace std;
namespace CLD2 {
string GetUniAt(const char* text) {
string retval;
retval.clear();
int uni_len = UniLen(text);
retval.append(text, uni_len);
return retval;
}
string GetBiAt(const char* text) {
string retval;
retval.clear();
int bi_len = BiLen(text);
retval.append(text, bi_len);
return retval;
}
string GetQuadAt(const char* text) {
string retval;
retval.clear();
if (text[-1] == ' ') {retval.append("_");}
int quad_len = QuadLen(text);
retval.append(text, quad_len);
if (text[quad_len] == ' ') {retval.append("_");}
return retval;
}
string GetOctaAt(const char* text) {
string retval;
retval.clear();
if (text[-1] == ' ') {retval.append("_");}
int octa_len = OctaLen(text);
retval.append(text, octa_len);
if (text[octa_len] == ' ') {retval.append("_");}
return retval;
}
string GetOcta2At(const char* text) {
string retval;
retval.clear();
if (text[-1] == ' ') {retval.append("_");}
int octa_len = OctaLen(text);
retval.append(text, octa_len);
if (text[octa_len] == ' ') {retval.append("_");}
text += (octa_len + 1);
int octa2_len = OctaLen(text);
retval.append(text, octa2_len);
if (text[octa2_len] == ' ') {retval.append("_");}
return retval;
}
// Formats one (per-script language number, probability) pair as
// "<language code>.<qprob>", e.g. a code followed by a dot and a decimal
// number.
//
//   ulscript  script used to map |pslang| to a Language
//   pslang    per-script language number, resolved via FromPerScriptNumber
//   qprob     value printed in decimal after the dot
//
// The language code is appended to the result directly instead of being
// sprintf'ed into a small fixed buffer, so a longer-than-expected code string
// can no longer overflow the stack buffer. Only the integer goes through
// sprintf, and 16 bytes is enough for any 32-bit int in decimal.
std::string FmtLP(ULScript ulscript, uint8 pslang, uint8 qprob) {
  const Language lang = FromPerScriptNumber(ulscript, pslang);

  char number[16];
  sprintf(number, "%d", qprob);

  std::string retval(LanguageCode(lang));
  retval.append(".");
  retval.append(number);
  return retval;
}
string GetLangProbTxt(const ScoringContext* scoringcontext, uint32 langprob) {
string retval;
retval.clear();
uint8 prob123 = (langprob >> 0) & 0xff;
const uint8* prob123_entry = LgProb2TblEntry(prob123);
uint8 top1 = (langprob >> 8) & 0xff;
if (top1 > 0) {
retval.append(FmtLP(scoringcontext->ulscript,
top1, LgProb3(prob123_entry, 0)));
}
uint8 top2 = (langprob >> 16) & 0xff;
if (top2 > 0) {
if (!retval.empty()) {retval.append("~");}
retval.append(FmtLP(scoringcontext->ulscript,
top2, LgProb3(prob123_entry, 1)));
}
uint8 top3 = (langprob >> 24) & 0xff;
if (top3 > 0) {
if (!retval.empty()) {retval.append("~");}
retval.append(FmtLP(scoringcontext->ulscript,
top3, LgProb3(prob123_entry, 2)));
}
return retval;
}
string GetScoreTxt(const ScoringContext* scoringcontext,
const CLD2TableSummary* base_obj, int indirect) {
string retval;
retval.clear();
if (indirect < static_cast<int>(base_obj->kCLDTableSizeOne)) {
uint32 langprob = base_obj->kCLDTableInd[indirect];
retval.append(GetLangProbTxt(scoringcontext, langprob));
} else {
indirect += (indirect - base_obj->kCLDTableSizeOne);
uint32 langprob = base_obj->kCLDTableInd[indirect];
uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1];
retval.append(GetLangProbTxt(scoringcontext, langprob));
if (!retval.empty()) {retval.append("~");}
retval.append(GetLangProbTxt(scoringcontext, langprob2));
}
return retval;
}
// Sixteen 0xRRGGBB background colors. GetBackColor selects one with the low
// four bits of the language number.
static const int kLangBackground[16] = {
  0xffd8d8, 0xf8ffd8, 0xd8ffe7, 0xd8f3ff,   // 0..3
  0xefd8ff, 0xffd8eb, 0xfff7d8, 0xe3ffd8,   // 4..7
  0xd8ffff, 0xe3d8ff, 0xffd8f7, 0xffebd8,   // 8..11
  0xefffd8, 0xd8fff3, 0xd8e7ff, 0xf8d8ff,   // 12..15
};
// Sixteen 0xRRGGBB text colors. GetTextColor selects one with bits 4..7 of
// the language number.
static const int kLangColor[16] = {
  0x000000, 0x7f2f00, 0x7f5f00, 0x6f7f00,   // 0..3
  0x3f7f00, 0x0f7f00, 0x007f1f, 0x007f4f,   // 4..7
  0x007f7f, 0x004f7f, 0x001f7f, 0x0f007f,   // 8..11
  0x3f007f, 0x6f007f, 0x7f005f, 0x7f002f,   // 12..15
};
// Fixed 0xRRGGBB colors for the special cases handled by GetBackColor and
// GetTextColor.
static const int kUnscoredText = 0xb0b0b0;         // text, UNKNOWN_LANGUAGE / negative
static const int kUnscoredBackground = 0xffffff;   // background, UNKNOWN_LANGUAGE / negative
static const int kIgnoremeText = 0x8090a0;         // text, TG_UNKNOWN_LANGUAGE
static const int kIgnoremeBackground = 0xffeecc;   // background, TG_UNKNOWN_LANGUAGE
static const int kEnglishBackground = 0xfffff4;    // background, ENGLISH
// Picks the 0xRRGGBB background color for |lang|.
//   ENGLISH              -> kEnglishBackground
//   UNKNOWN_LANGUAGE     -> kUnscoredBackground
//   TG_UNKNOWN_LANGUAGE  -> kIgnoremeBackground
//   negative values      -> kUnscoredBackground
//   anything else        -> kLangBackground[low four bits of lang]
// When |lighten| is true each channel is halved and its top bit is set.
static int GetBackColor(Language lang, bool lighten) {
  // Start from the color shared by UNKNOWN_LANGUAGE and negative values.
  int rgb = kUnscoredBackground;
  if (lang == ENGLISH) {
    rgb = kEnglishBackground;
  } else if (lang != UNKNOWN_LANGUAGE) {
    if (lang == TG_UNKNOWN_LANGUAGE) {
      rgb = kIgnoremeBackground;
    } else if (!(lang < 0)) {
      rgb = kLangBackground[lang & 0x0f];
    }
  }
  return lighten ? ((rgb >> 1) | 0x808080) : rgb;
}
// Picks the 0xRRGGBB text color for |lang|.
//   UNKNOWN_LANGUAGE     -> kUnscoredText
//   TG_UNKNOWN_LANGUAGE  -> kIgnoremeText
//   negative values      -> kUnscoredText
//   anything else        -> kLangColor[bits 4..7 of lang]
// When |lighten| is true each channel is halved and its top bit is set.
static int GetTextColor(Language lang, bool lighten) {
  // Start from the color shared by UNKNOWN_LANGUAGE and negative values.
  int rgb = kUnscoredText;
  if (lang != UNKNOWN_LANGUAGE) {
    if (lang == TG_UNKNOWN_LANGUAGE) {
      rgb = kIgnoremeText;
    } else if (!(lang < 0)) {
      rgb = kLangColor[(lang >> 4) & 0x0f];
    }
  }
  return lighten ? ((rgb >> 1) | 0x808080) : rgb;
}
// Returns a copy of |txt| in which every '\n' and '\r' has been replaced by a
// single space. All other bytes are copied unchanged.
std::string GetPlainEscapedText(const std::string& txt) {
  std::string flat(txt);
  for (std::string::size_type pos = 0; pos < flat.size(); ++pos) {
    const char c = flat[pos];
    if ((c == '\n') || (c == '\r')) {
      flat[pos] = ' ';
    }
  }
  return flat;
}
// Returns a copy of |txt| that is safe to embed in HTML output:
//   '<'  -> "&lt;"      '>'  -> "&gt;"      '&' -> "&amp;"
//   '\'' -> "&#39;"     '"'  -> "&quot;"
//   '\n' and '\r' -> a single space
// All other bytes are copied unchanged.
//
// The replacement strings must be the entity spellings. Appending the raw
// character back (e.g. "<" for '<') leaves the text unescaped. The numeric
// reference "&#39;" is used for the apostrophe because it is understood by
// HTML parsers that predate the "&apos;" named entity.
std::string GetHtmlEscapedText(const std::string& txt) {
  std::string retval;
  retval.reserve(txt.size());
  for (std::string::size_type i = 0; i < txt.size(); ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':
        retval.append("&lt;");
        break;
      case '>':
        retval.append("&gt;");
        break;
      case '&':
        retval.append("&amp;");
        break;
      case '\'':
        retval.append("&#39;");
        break;
      case '"':
        retval.append("&quot;");
        break;
      case '\n':
      case '\r':
        retval.append(" ");
        break;
      default:
        retval.append(1, c);
        break;
    }
  }
  return retval;
}
// Wraps the HTML-escaped form of |txt| in a <span> whose background and text
// colors are the non-lightened colors chosen for |lang| by GetBackColor and
// GetTextColor.
std::string GetColorHtmlEscapedText(Language lang, const std::string& txt) {
  // Opening tag: 50 characters plus the terminator when both colors fit in
  // six hex digits.
  char open_tag[64];
  sprintf(open_tag, " <span style=\"background:#%06X;color:#%06X;\">\n",
          GetBackColor(lang, false),
          GetTextColor(lang, false));

  std::string html(open_tag);
  html += GetHtmlEscapedText(txt);
  html += "</span>";
  return html;
}
// Returns "[<language code>]" followed by the colored, HTML-escaped span that
// GetColorHtmlEscapedText produces for |lang| and |txt|.
std::string GetLangColorHtmlEscapedText(Language lang, const std::string& txt) {
  char label[64];
  sprintf(label, "[%s]", LanguageCode(lang));

  std::string html(label);
  html += GetColorHtmlEscapedText(lang, txt);
  return html;
}
void CLD2_Debug(const char* text,
int lo_offset,
int hi_offset,
bool more_to_come, bool score_cjk,
const ScoringHitBuffer* hitbuffer,
const ScoringContext* scoringcontext,
const ChunkSpan* cspan,
const ChunkSummary* chunksummary) {
FILE* df = scoringcontext->debug_file;
if (df == NULL) {return;}
if (scoringcontext->flags_cld2_verbose &&
(hitbuffer != NULL) &&
(cspan != NULL) && (hitbuffer->next_linear > 0)) {
int base_limit = cspan->chunk_base + cspan->base_len;
for (int i = cspan->chunk_base; i < base_limit; ++i) {
int ngram_start = hitbuffer->linear[i].offset;
uint32 langprob = hitbuffer->linear[i].langprob;
string ngram_text;
switch (hitbuffer->linear[i].type) {
case UNIHIT:
ngram_text = GetUniAt(&text[ngram_start]);
break;
case QUADHIT:
ngram_text = GetQuadAt(&text[ngram_start]);
break;
case DELTAHIT:
case DISTINCTHIT:
if (score_cjk) {
ngram_text = GetBiAt(&text[ngram_start]);
} else {
ngram_text = GetOctaAt(&text[ngram_start]);
}
break;
}
string score_text = GetLangProbTxt(scoringcontext, langprob);
fprintf(df, "%c:%s=%s ",
"UQLD"[hitbuffer->linear[i].type],
ngram_text.c_str(),
score_text.c_str());
}
fprintf(df, "<br>\n");
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
if (scoringcontext->ulscript != ULScript_Latin) {
langprior_boost = &scoringcontext->langprior_boost.othr;
langprior_whack = &scoringcontext->langprior_whack.othr;
distinct_boost = &scoringcontext->distinct_boost.othr;
}
fprintf(df, "LangPrior_boost: ");
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = langprior_boost->langprob[k];
if (langprob > 0) {
fprintf(df, "%s ",
GetLangProbTxt(scoringcontext, langprob).c_str());
}
}
fprintf(df, "LangPrior_whack: ");
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = langprior_whack->langprob[k];
if (langprob > 0) {
fprintf(df, "%s ",
GetLangProbTxt(scoringcontext, langprob).c_str());
}
}
fprintf(df, "Distinct_boost: ");
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = distinct_boost->langprob[k];
if (langprob > 0) {
fprintf(df, "%s ",
GetLangProbTxt(scoringcontext, langprob).c_str());
}
}
fprintf(df, "<br>\n");
fprintf(df, "%s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
LanguageCode(static_cast<Language>(chunksummary->lang1)),
chunksummary->score1,
LanguageCode(static_cast<Language>(chunksummary->lang2)),
chunksummary->score2,
chunksummary->bytes,
chunksummary->grams,
ULScriptCode(static_cast<ULScript>(chunksummary->ulscript)),
chunksummary->reliability_delta,
chunksummary->reliability_score);
}
bool is_reliable = true;
bool match_prior = false;
int reliable = CLD2::minint(chunksummary->reliability_delta,
chunksummary->reliability_score);
is_reliable = (reliable >= 75);
match_prior = (chunksummary->lang1 == scoringcontext->prior_chunk_lang);
if (!is_reliable) {match_prior = false;}
if (match_prior) {
fprintf(df, "[]");
} else if (is_reliable) {
fprintf(df, "[%s]",
LanguageCode(static_cast<Language>(chunksummary->lang1)));
} else {
fprintf(df, "[%s*.%d/%s.%d]",
LanguageCode(static_cast<Language>(chunksummary->lang1)),
chunksummary->score1,
LanguageCode(static_cast<Language>(chunksummary->lang2)),
chunksummary->score2);
}
int chunktext_len = hi_offset - lo_offset;
if (chunktext_len < 0) {
chunktext_len = 0;
fprintf(df, " LEN_ERR hi %d lo %d<br>\n", hi_offset, lo_offset);
}
string chunk_text(&text[lo_offset], chunktext_len);
Language lang = static_cast<Language>(chunksummary->lang1);
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
GetBackColor(lang, false),
GetTextColor(lang, false));
fprintf(df, "%s", chunk_text.c_str());
if (scoringcontext->flags_cld2_cr) {
fprintf(df, "</span><br>\n");
} else {
fprintf(df, "</span> \n");
}
}
void CLD2_Debug2(const char* text,
bool more_to_come, bool score_cjk,
const ScoringHitBuffer* hitbuffer,
const ScoringContext* scoringcontext,
const SummaryBuffer* summarybuffer) {
FILE* df = scoringcontext->debug_file;
if (df == NULL) {return;}
uint16 prior_chunk_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
for (int i = 0; i < summarybuffer->n; ++i) {
fprintf(df, "Debug2[%d] ", i);
const ChunkSummary* chunksummary = &summarybuffer->chunksummary[i];
bool is_reliable = true;
bool match_prior = false;
int reliable = CLD2::minint(chunksummary->reliability_delta,
chunksummary->reliability_score);
is_reliable = (reliable >= 75);
match_prior = (chunksummary->lang1 == prior_chunk_lang);
if (!is_reliable) {match_prior = false;}
if (match_prior) {
fprintf(df, "[]");
} else if (is_reliable) {
fprintf(df, "[%s]",
LanguageCode(static_cast<Language>(chunksummary->lang1)));
} else {
fprintf(df, "[%s*.%d/%s.%d]",
LanguageCode(static_cast<Language>(chunksummary->lang1)),
chunksummary->score1,
LanguageCode(static_cast<Language>(chunksummary->lang2)),
chunksummary->score2);
}
int lo_offset = chunksummary->offset;
int chunktext_len = chunksummary->bytes;
string chunk_text(&text[lo_offset], chunktext_len);
Language lang = static_cast<Language>(chunksummary->lang1);
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
GetBackColor(lang, false),
GetTextColor(lang, false));
fprintf(df, "%s", chunk_text.c_str());
if (scoringcontext->flags_cld2_cr) {
fprintf(df, "</span><br>\n");
} else {
fprintf(df, "</span> \n");
}
prior_chunk_lang = chunksummary->lang1;
}
}
// Writes every entry of |resultchunkvector| to |f| as HTML: a header line
// with the entry count, then for each entry its index, offset, byte count and
// language code followed by the chunk's text in a colored, HTML-escaped span.
//
//   f                  output stream
//   src                text that the chunk offsets refer to
//   resultchunkvector  chunks to dump
void DumpResultChunkVector(FILE* f, const char* src,
                           ResultChunkVector* resultchunkvector) {
  // size() returns the container's unsigned size type, which does not match
  // "%ld"; convert explicitly so the argument and the format agree on every
  // platform.
  fprintf(f, "DumpResultChunkVector[%ld]<br>\n",
          static_cast<long>(resultchunkvector->size()));
  for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
    ResultChunk* rc = &(*resultchunkvector)[i];
    Language lang1 = static_cast<Language>(rc->lang1);
    // NOTE(review): string(src, pos, n) first converts |src| to a temporary
    // std::string (scanning to the terminating NUL) and then takes the
    // substring, so |src| must be NUL-terminated and the whole text is
    // copied once per chunk. Kept as-is to preserve behavior; confirm
    // whether string(src + rc->offset, rc->bytes) is acceptable for callers.
    string this_chunk = string(src, rc->offset, rc->bytes);
    fprintf(f, "[%d]{%d %d %s} ", i, rc->offset, rc->bytes, LanguageCode(lang1));
    fprintf(f, "%s<br>\n", GetColorHtmlEscapedText(lang1, this_chunk).c_str());
  }
  fprintf(f, "<br>\n");
}
}