#include <string>
#include "logging.h"
#include "pod_array.h"
#include "regexp.h"
#include "utf.h"
#include "util.h"
#include "walker-inl.h"
namespace lbug {
namespace regex {
bool Regexp::SimplifyRegexp(
const StringPiece& src, ParseFlags flags, std::string* dst, RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple())
return false;
return true;
case kRegexpCharClass:
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple())
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
class CoalesceWalker : public Regexp::Walker<Regexp*> {
public:
CoalesceWalker() {}
virtual Regexp* PostVisit(
Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
static bool CanCoalesce(Regexp* r1, Regexp* r2);
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
CoalesceWalker(const CoalesceWalker&) = delete;
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
};
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(
Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
static Regexp* SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags parse_flags);
static Regexp* SimplifyCharClass(Regexp* re);
SimplifyWalker(const SimplifyWalker&) = delete;
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
};
Regexp* Regexp::Simplify() {
CoalesceWalker cw;
Regexp* cre = cw.Walk(this, NULL);
if (cre == NULL)
return NULL;
if (cw.stopped_early()) {
cre->Decref();
return NULL;
}
SimplifyWalker sw;
Regexp* sre = sw.Walk(cre, NULL);
cre->Decref();
if (sre == NULL)
return NULL;
if (sw.stopped_early()) {
sre->Decref();
return NULL;
}
return sre;
}
#define Simplify DontCallSimplify
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
for (int i = 0; i < re->nsub(); i++) {
Regexp* sub = re->sub()[i];
Regexp* newsub = child_args[i];
if (newsub != sub)
return true;
}
for (int i = 0; i < re->nsub(); i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
return false;
}
Regexp* CoalesceWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
#endif
return re->Incref();
}
Regexp* CoalesceWalker::PostVisit(
Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) {
if (re->nsub() == 0)
return re->Incref();
if (re->op() != kRegexpConcat) {
if (!ChildArgsChanged(re, child_args))
return re->Incref();
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
if (re->op() == kRegexpRepeat) {
nre->min_ = re->min();
nre->max_ = re->max();
} else if (re->op() == kRegexpCapture) {
nre->cap_ = re->cap();
}
return nre;
}
bool can_coalesce = false;
for (int i = 0; i < re->nsub(); i++) {
if (i + 1 < re->nsub() && CanCoalesce(child_args[i], child_args[i + 1])) {
can_coalesce = true;
break;
}
}
if (!can_coalesce) {
if (!ChildArgsChanged(re, child_args))
return re->Incref();
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
return nre;
}
for (int i = 0; i < re->nsub(); i++) {
if (i + 1 < re->nsub() && CanCoalesce(child_args[i], child_args[i + 1]))
DoCoalesce(&child_args[i], &child_args[i + 1]);
}
int n = 0;
for (int i = n; i < re->nsub(); i++) {
if (child_args[i]->op() == kRegexpEmptyMatch)
n++;
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub() - n);
Regexp** nre_subs = nre->sub();
for (int i = 0, j = 0; i < re->nsub(); i++) {
if (child_args[i]->op() == kRegexpEmptyMatch) {
child_args[i]->Decref();
continue;
}
nre_subs[j] = child_args[i];
j++;
}
return nre;
}
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
if ((r1->op() == kRegexpStar || r1->op() == kRegexpPlus || r1->op() == kRegexpQuest ||
r1->op() == kRegexpRepeat) &&
(r1->sub()[0]->op() == kRegexpLiteral || r1->sub()[0]->op() == kRegexpCharClass ||
r1->sub()[0]->op() == kRegexpAnyChar || r1->sub()[0]->op() == kRegexpAnyByte)) {
if ((r2->op() == kRegexpStar || r2->op() == kRegexpPlus || r2->op() == kRegexpQuest ||
r2->op() == kRegexpRepeat) &&
Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
((r1->parse_flags() & Regexp::NonGreedy) == (r2->parse_flags() & Regexp::NonGreedy))) {
return true;
}
if (Regexp::Equal(r1->sub()[0], r2)) {
return true;
}
if (r1->sub()[0]->op() == kRegexpLiteral && r2->op() == kRegexpLiteralString &&
r2->runes()[0] == r1->sub()[0]->rune() &&
((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
(r2->parse_flags() & Regexp::FoldCase))) {
return true;
}
}
return false;
}
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
Regexp* r1 = *r1ptr;
Regexp* r2 = *r2ptr;
Regexp* nre = Regexp::Repeat(r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
switch (r1->op()) {
case kRegexpStar:
nre->min_ = 0;
nre->max_ = -1;
break;
case kRegexpPlus:
nre->min_ = 1;
nre->max_ = -1;
break;
case kRegexpQuest:
nre->min_ = 0;
nre->max_ = 1;
break;
case kRegexpRepeat:
nre->min_ = r1->min();
nre->max_ = r1->max();
break;
default:
nre->Decref();
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
return;
}
switch (r2->op()) {
case kRegexpStar:
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpPlus:
nre->min_++;
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpQuest:
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
case kRegexpRepeat:
nre->min_ += r2->min();
if (r2->max() == -1)
nre->max_ = -1;
else if (nre->max() != -1)
nre->max_ += r2->max();
goto LeaveEmpty;
case kRegexpLiteral:
case kRegexpCharClass:
case kRegexpAnyChar:
case kRegexpAnyByte:
nre->min_++;
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
LeaveEmpty:
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
*r2ptr = nre;
break;
case kRegexpLiteralString: {
Rune r = r1->sub()[0]->rune();
int n = 1;
while (n < r2->nrunes() && r2->runes()[n] == r)
n++;
nre->min_ += n;
if (nre->max() != -1)
nre->max_ += n;
if (n == r2->nrunes())
goto LeaveEmpty;
*r1ptr = nre;
*r2ptr = Regexp::LiteralString(&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
break;
}
default:
nre->Decref();
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
return;
}
r1->Decref();
r2->Decref();
}
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
#endif
return re->Incref();
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple()) {
*stop = true;
return re->Incref();
}
return NULL;
}
Regexp* SimplifyWalker::PostVisit(
Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
if (re->op() == newsub->op() && re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags parse_flags) {
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
re->AllocSub(2);
Regexp** subs = re->sub();
subs[0] = re1;
subs[1] = re2;
return re;
}
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags f) {
if (max == -1) {
if (min == 0)
return Regexp::Star(re->Incref(), f);
if (min == 1)
return Regexp::Plus(re->Incref(), f);
PODArray<Regexp*> nre_subs(min);
for (int i = 0; i < min - 1; i++)
nre_subs[i] = re->Incref();
nre_subs[min - 1] = Regexp::Plus(re->Incref(), f);
return Regexp::Concat(nre_subs.data(), min, f);
}
if (min == 0 && max == 0)
return new Regexp(kRegexpEmptyMatch, f);
if (min == 1 && max == 1)
return re->Incref();
Regexp* nre = NULL;
if (min > 0) {
PODArray<Regexp*> nre_subs(min);
for (int i = 0; i < min; i++)
nre_subs[i] = re->Incref();
nre = Regexp::Concat(nre_subs.data(), min, f);
}
if (max > min) {
Regexp* suf = Regexp::Quest(re->Incref(), f);
for (int i = min + 1; i < max; i++)
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
if (nre == NULL)
nre = suf;
else
nre = Concat2(nre, suf, f);
}
if (nre == NULL) {
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
return new Regexp(kRegexpNoMatch, f);
}
return nre;
}
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
CharClass* cc = re->cc();
if (cc->empty())
return new Regexp(kRegexpNoMatch, re->parse_flags());
if (cc->full())
return new Regexp(kRegexpAnyChar, re->parse_flags());
return re->Incref();
}
} }