#include "MaxMatchSegmentation.hpp"
#include "PrefixMatch.hpp"
#include "Segments.hpp"
#include "UTF8Util.hpp"
using namespace opencc;
// Constructs a segmenter over `_dict`.
//
// The dictionary handle is stored in `dict`, and a new PrefixMatch is
// allocated from the same dictionary and stored in `prefixMatch`; Segment()
// performs its prefix lookups through that object.
// NOTE(review): `prefixMatch` is initialized from a raw `new` expression --
// presumably the member is a smart pointer that takes ownership; confirm in
// the class declaration.
MaxMatchSegmentation::MaxMatchSegmentation(const DictPtr _dict)
: dict(_dict), prefixMatch(new PrefixMatch(_dict)) {}
// Splits `text` into an ordered list of segments.
//
// The text is scanned left to right. At each position the prefix matcher is
// asked for a dictionary key that starts there:
//   * on a hit, the key becomes its own segment and the cursor jumps past it;
//   * on a miss, one character is skipped and appended to a pending run of
//     unmatched bytes, which is emitted as a single segment right before the
//     next hit (or at the end of the scan).
//
// Scanning stops at the first NUL byte of the underlying buffer.
//
// @param text  Input bytes (presumably UTF-8, given the UTF8Util helpers).
// @return      Newly allocated Segments holding the pieces in input order.
SegmentsPtr MaxMatchSegmentation::Segment(const std::string& text) const {
  SegmentsPtr result(new Segments);

  const char* const bufferEnd = text.c_str() + text.length();

  // Run of consecutive unmatched bytes that has not been emitted yet.
  const char* pendingStart = text.c_str();
  size_t pendingLength = 0;

  // Emits the pending unmatched run (if any) as one segment and resets it.
  auto flushPending = [&result, &pendingStart, &pendingLength]() {
    if (pendingLength == 0) {
      return;
    }
    result->AddSegment(UTF8Util::FromSubstr(pendingStart, pendingLength));
    pendingLength = 0;
  };

  const char* cursor = text.c_str();
  while (*cursor != '\0') {
    const size_t bytesLeft = static_cast<size_t>(bufferEnd - cursor);
    const PrefixMatch::Match hit = prefixMatch->MatchPrefix(cursor, bytesLeft);

    if (hit.matched) {
      // Unmatched bytes seen so far must precede the matched key in the
      // output, so emit them first.
      flushPending();
      const size_t keyBytes = hit.keyLength;
      result->AddSegment(*hit.key);
      cursor += keyBytes;
      // Any following unmatched run begins right after the key.
      pendingStart = cursor;
      continue;
    }

    // No key starts here: step over a single character and extend the
    // pending run with it.
    size_t step = UTF8Util::NextCharLength(cursor);
    if (step > bytesLeft) {
      // The reported character length overshoots the buffer (e.g. a
      // truncated multi-byte sequence); never advance past the end.
      step = bytesLeft;
    }
    pendingLength += step;
    cursor += step;
  }

  // Emit whatever unmatched tail is left after the last hit.
  flushPending();
  return result;
}