#include <exception>
#include <fstream>
#include "src/CmdLineOutput.hpp"
#include "src/PhraseExtract.hpp"
using opencc::Exception;
using opencc::PhraseExtract;
using opencc::UTF8StringSlice;
void Extract(const std::vector<std::string>& inputFiles,
const std::string& outputFile) {
std::ostringstream buffer;
for (const auto& inputFile : inputFiles) {
std::ifstream ifs(inputFile);
const std::string contents((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
buffer << contents;
}
const std::string& text = buffer.str();
PhraseExtract extractor;
extractor.SetWordMaxLength(2);
extractor.SetPrefixSetLength(1);
extractor.SetSuffixSetLength(1);
extractor.Extract(text);
std::ofstream ofs(outputFile);
for (const auto& word : extractor.Words()) {
const PhraseExtract::Signals& signals = extractor.Signal(word);
const double entropy = signals.prefixEntropy + signals.suffixEntropy;
const double logProbablity = extractor.LogProbability(word);
ofs << word << " " << signals.frequency << " " << logProbablity << " "
<< signals.cohesion << " " << entropy << " " << signals.prefixEntropy
<< " " << signals.suffixEntropy << std::endl;
}
ofs.close();
}
int main(int argc, const char* argv[]) {
try {
TCLAP::CmdLine cmd("Open Chinese Convert (OpenCC) Phrase Extractor", ' ',
VERSION);
CmdLineOutput cmdLineOutput;
cmd.setOutput(&cmdLineOutput);
TCLAP::UnlabeledMultiArg<std::string> fileNames(
"fileName", "Input files", true , "files");
cmd.add(fileNames);
TCLAP::ValueArg<std::string> outputArg(
"o", "output", "Output file", true , "" ,
"file" , cmd);
cmd.parse(argc, argv);
Extract(fileNames.getValue(), outputArg.getValue());
} catch (TCLAP::ArgException& e) {
std::cerr << "error: " << e.error() << " for arg " << e.argId()
<< std::endl;
} catch (Exception& e) {
std::cerr << e.what() << std::endl;
}
return 0;
}