#include <fstream>
#include <string>
#include <vector>
#include "common.h"
#include "feature_index.h"
#include "freelist.h"
#include "lbfgs.h"
#include "learner_tagger.h"
#include "param.h"
#include "string_buffer.h"
#include "thread.h"
#include "utils.h"
namespace MeCab {
namespace {
#define DCONF(file) create_filename(dicdir, std::string(file)).c_str()
#ifdef MECAB_USE_THREAD
class learner_thread: public thread {
public:
unsigned short start_i;
unsigned short thread_num;
size_t size;
size_t micro_p;
size_t micro_r;
size_t micro_c;
size_t err;
double f;
EncoderLearnerTagger **x;
std::vector<double> expected;
void run() {
micro_p = micro_r = micro_c = err = 0;
f = 0.0;
std::fill(expected.begin(), expected.end(), 0.0);
for (size_t i = start_i; i < size; i += thread_num) {
f += x[i]->gradient(&expected[0]);
err += x[i]->eval(µ_c, µ_p, µ_r);
}
}
};
#endif
class CRFLearner {
public:
static int run(Param *param) {
const std::string dicdir = param->get<std::string>("dicdir");
CHECK_DIE(param->load(DCONF(DICRC)))
<< "no such file or directory: " << DCONF(DICRC);
const std::vector<std::string> &files = param->rest_args();
if (files.size() != 2) {
std::cout << "Usage: " <<
param->program_name() << " corpus model" << std::endl;
return -1;
}
const std::string ifile = files[0];
const std::string model = files[1];
const std::string old_model = param->get<std::string>("old-model");
EncoderFeatureIndex feature_index;
std::vector<double> expected;
std::vector<double> observed;
std::vector<double> alpha;
std::vector<double> old_alpha;
std::vector<EncoderLearnerTagger *> x;
Tokenizer<LearnerNode, LearnerPath> tokenizer;
Allocator<LearnerNode, LearnerPath> allocator;
CHECK_DIE(tokenizer.open(*param)) << "cannot open tokenizer";
CHECK_DIE(feature_index.open(*param)) << "cannot open feature index";
if (!old_model.empty()) {
std::cout << "Using previous model: " << old_model << std::endl;
std::cout << "--cost --freq and --eta options are overwritten."
<< std::endl;
CHECK_DIE(tokenizer.dictionary_info());
const char *dic_charset = tokenizer.dictionary_info()->charset;
feature_index.reopen(old_model.c_str(),
dic_charset, &old_alpha, param);
}
const double C = param->get<double>("cost");
const double eta = param->get<double>("eta");
const size_t eval_size = param->get<size_t>("eval-size");
const size_t unk_eval_size = param->get<size_t>("unk-eval-size");
const size_t thread_num = param->get<size_t>("thread");
const size_t freq = param->get<size_t>("freq");
CHECK_DIE(C > 0) << "cost parameter is out of range: " << C;
CHECK_DIE(eta > 0) "eta is out of range: " << eta;
CHECK_DIE(eval_size > 0) << "eval-size is out of range: " << eval_size;
CHECK_DIE(unk_eval_size > 0) <<
"unk-eval-size is out of range: " << unk_eval_size;
CHECK_DIE(freq > 0) <<
"freq is out of range: " << unk_eval_size;
CHECK_DIE(thread_num > 0 && thread_num <= 512)
<< "# thread is invalid: " << thread_num;
std::cout.setf(std::ios::fixed, std::ios::floatfield);
std::cout.precision(5);
std::cout << "reading corpus ..." << std::flush;
std::ifstream ifs(WPATH(ifile.c_str()));
CHECK_DIE(ifs) << "no such file or directory: " << ifile;
while (ifs) {
EncoderLearnerTagger *tagger = new EncoderLearnerTagger();
CHECK_DIE(tagger->open(&tokenizer,
&allocator,
&feature_index,
eval_size,
unk_eval_size));
CHECK_DIE(tagger->read(&ifs, &observed));
if (!tagger->empty()) {
x.push_back(tagger);
} else {
delete tagger;
}
if (x.size() % 100 == 0) {
std::cout << x.size() << "... " << std::flush;
}
}
feature_index.shrink(freq, &observed);
feature_index.clearcache();
const size_t psize = feature_index.size();
observed.resize(psize);
expected.resize(psize);
alpha.resize(psize);
old_alpha.resize(psize);
alpha = old_alpha;
feature_index.set_alpha(&alpha[0]);
std::cout << std::endl;
std::cout << "Number of sentences: " << x.size() << std::endl;
std::cout << "Number of features: " << psize << std::endl;
std::cout << "eta: " << eta << std::endl;
std::cout << "freq: " << freq << std::endl;
std::cout << "eval-size: " << eval_size << std::endl;
std::cout << "unk-eval-size: " << unk_eval_size << std::endl;
#ifdef MECAB_USE_THREAD
std::cout << "threads: " << thread_num << std::endl;
#endif
std::cout << "charset: " <<
tokenizer.dictionary_info()->charset << std::endl;
std::cout << "C(sigma^2): " << C << std::endl
<< std::endl;
#ifdef MECAB_USE_THREAD
std::vector<learner_thread> thread;
if (thread_num > 1) {
thread.resize(thread_num);
for (size_t i = 0; i < thread_num; ++i) {
thread[i].start_i = i;
thread[i].size = x.size();
thread[i].thread_num = thread_num;
thread[i].x = &x[0];
thread[i].expected.resize(expected.size());
}
}
#endif
int converge = 0;
double prev_obj = 0.0;
LBFGS lbfgs;
for (size_t itr = 0; ; ++itr) {
std::fill(expected.begin(), expected.end(), 0.0);
double obj = 0.0;
size_t err = 0;
size_t micro_p = 0;
size_t micro_r = 0;
size_t micro_c = 0;
#ifdef MECAB_USE_THREAD
if (thread_num > 1) {
for (size_t i = 0; i < thread_num; ++i) {
thread[i].start();
}
for (size_t i = 0; i < thread_num; ++i) {
thread[i].join();
}
for (size_t i = 0; i < thread_num; ++i) {
obj += thread[i].f;
err += thread[i].err;
micro_r += thread[i].micro_r;
micro_p += thread[i].micro_p;
micro_c += thread[i].micro_c;
for (size_t k = 0; k < psize; ++k) {
expected[k] += thread[i].expected[k];
}
}
} else
#endif
{
for (size_t i = 0; i < x.size(); ++i) {
obj += x[i]->gradient(&expected[0]);
err += x[i]->eval(µ_c, µ_p, µ_r);
}
}
const double p = 1.0 * micro_c / micro_p;
const double r = 1.0 * micro_c / micro_r;
const double micro_f = 2 * p * r / (p + r);
for (size_t i = 0; i < psize; ++i) {
const double penalty = (alpha[i] - old_alpha[i]);
obj += (penalty * penalty / (2.0 * C));
expected[i] = expected[i] - observed[i] + penalty / C;
}
const double diff = (itr == 0 ? 1.0 :
std::fabs(1.0 * (prev_obj - obj)) / prev_obj);
std::cout << "iter=" << itr
<< " err=" << 1.0 * err/x.size()
<< " F=" << micro_f
<< " target=" << obj
<< " diff=" << diff << std::endl;
prev_obj = obj;
if (diff < eta) {
converge++;
} else {
converge = 0;
}
if (converge == 3) {
break; }
const int ret = lbfgs.optimize(psize,
&alpha[0], obj,
&expected[0], false, C);
CHECK_DIE(ret >= 0) << "unexpected error in LBFGS routin";
if (ret == 0) {
break;
}
}
std::cout << "\nDone! writing model file ... " << std::endl;
std::ostringstream oss;
oss << "eta: " << eta << std::endl;
oss << "freq: " << freq << std::endl;
oss << "C: " << C << std::endl;
oss.setf(std::ios::fixed, std::ios::floatfield);
oss.precision(16);
oss << "eval-size: " << eval_size << std::endl;
oss << "unk-eval-size: " << unk_eval_size << std::endl;
oss << "charset: " << tokenizer.dictionary_info()->charset << std::endl;
const std::string header = oss.str();
CHECK_DIE(feature_index.save(model.c_str(), header.c_str()))
<< "permission denied: " << model;
return 0;
}
};
class Learner {
public:
static bool run(int argc, char **argv) {
static const MeCab::Option long_options[] = {
{ "dicdir", 'd', ".", "DIR",
"set DIR as dicdir(default \".\" )" },
{ "old-model", 'M', 0, "FILE",
"set FILE as old CRF model file" },
{ "cost", 'c', "1.0", "FLOAT",
"set FLOAT for cost C for constraints violatoin" },
{ "freq", 'f', "1", "INT",
"set the frequency cut-off (default 1)" },
{ "eta", 'e', "0.00005", "DIR",
"set FLOAT for tolerance of termination criterion" },
{ "thread", 'p', "1", "INT", "number of threads(default 1)" },
{ "version", 'v', 0, 0, "show the version and exit" },
{ "help", 'h', 0, 0, "show this help and exit." },
{ 0, 0, 0, 0 }
};
Param param;
if (!param.open(argc, argv, long_options)) {
std::cout << param.what() << "\n\n" << COPYRIGHT
<< "\ntry '--help' for more information." << std::endl;
return -1;
}
if (!param.help_version()) {
return 0;
}
return CRFLearner::run(¶m);
}
};
}
}
int mecab_cost_train(int argc, char **argv) {
return MeCab::Learner::run(argc, argv);
}