#include <openbabel/babelconfig.h>
#include <sstream>
#include <iostream>
#include <fstream>
#include <openbabel/mol.h>
#include <openbabel/obconversion.h>
#include <openbabel/fingerprint.h>
#include <openbabel/op.h>
#include <openbabel/elements.h>
#include <openbabel/bond.h>
#include <openbabel/obutil.h>
#include <cstdlib>
#include <algorithm>
using namespace std;
namespace OpenBabel {
class FastSearchFormat : public OBFormat
{
public:
FastSearchFormat() : fsi(nullptr)
{
OBConversion::RegisterFormat("fs",this);
OBConversion::RegisterOptionParam("S", this, 1, OBConversion::GENOPTIONS);
OBConversion::RegisterOptionParam("S", this, 1, OBConversion::INOPTIONS);
OBConversion::RegisterOptionParam("f", this, 1);
OBConversion::RegisterOptionParam("N", this, 1);
OBConversion::RegisterOptionParam("u", this, 0);
OBConversion::RegisterOptionParam("t", this, 1, OBConversion::INOPTIONS);
OBConversion::RegisterOptionParam("l", this, 1, OBConversion::INOPTIONS);
OBConversion::RegisterOptionParam("a", this, 0, OBConversion::INOPTIONS);
OBConversion::RegisterOptionParam("e", this, 0, OBConversion::INOPTIONS);
}
virtual const char* Description() { return
"Fastsearch format\n"
"Fingerprint-aided substructure and similarity searching\n\n"
"Writing to the fs format makes an index of a multi-molecule datafile::\n\n"
" obabel dataset.sdf -ofs\n\n"
"This prepares an index :file:`dataset.fs` with default parameters, and is slow\n"
"(~30 minutes for a 250,000 molecule file).\n\n"
"However, when reading from the fs format searches are much faster, a few seconds,\n"
"and so can be done interactively.\n\n"
"The search target is the parameter of the ``-s`` option and can be\n"
"slightly extended SMILES (with ``[#n]`` atoms and ``~`` bonds) or\n"
"the name of a file containing a molecule.\n\n"
"Several types of searches are possible:\n\n"
"- Identical molecule::\n\n"
" obabel index.fs -O outfile.yyy -s SMILES exact\n\n"
"- Substructure::\n\n"
" obabel index.fs -O outfile.yyy -s SMILES or\n"
" obabel index.fs -O outfile.yyy -s filename.xxx\n\n"
" where ``xxx`` is a format id known to OpenBabel, e.g. sdf\n"
"- Molecular similarity based on Tanimoto coefficient::\n\n"
" obabel index.fs -O outfile.yyy -at15 -sSMILES # best 15 molecules\n"
" obabel index.fs -O outfile.yyy -at0.7 -sSMILES # Tanimoto >0.7\n"
" obabel index.fs -O outfile.yyy -at0.7,0.9 -sSMILES\n"
" # Tanimoto >0.7 && Tanimoto < 0.9\n\n"
"The datafile plus the ``-ifs`` option can be used instead of the index file.\n\n"
"NOTE on 32-bit systems the datafile MUST NOT be larger than 4GB.\n\n"
"Dative bonds like -[N+][O-](=O) are indexed as -N(=O)(=O), and when searching\n"
"the target molecule should be in the second form.\n\n"
".. seealso::\n\n"
" :ref:`fingerprints`\n\n"
"Write Options (when making index) e.g. -xfFP3\n"
" f# Fingerprint type\n"
" If not specified, the default fingerprint (currently FP2) is used\n"
" N# Fold fingerprint to # bits\n"
" u Update an existing index\n\n"
"Read Options (when searching) e.g. -at0.7\n"
" t# Do similarity search:#mols or # as min Tanimoto\n"
" a Add Tanimoto coeff to title in similarity search\n"
" l# Maximum number of candidates. Default<4000>\n"
" e Exact match\n"
" Alternative to using exact in ``-s`` parameter, see above\n"
" n No further SMARTS filtering after fingerprint phase\n\n"
;
};
virtual unsigned int Flags(){return READBINARY | READONEONLY | WRITEBINARY;};
public:
virtual bool ReadChemObject(OBConversion* pConv);
virtual bool WriteChemObject(OBConversion* pConv);
private:
bool ObtainTarget(OBConversion* pConv, std::vector<OBMol>& patternMols, const std::string& indexname);
void AddPattern(vector<OBMol>& patternMols, OBMol patternMol, int idx);
private:
FastSearch fs;
FastSearchIndexer* fsi;
streampos LastSeekpos; OBStopwatch sw; int nmols; };
FastSearchFormat theFastSearchFormat;
bool FastSearchFormat::ReadChemObject(OBConversion* pConv)
{
std::string auditMsg = "OpenBabel::Read fastsearch index ";
std::string description(Description());
auditMsg += description.substr(0,description.find('\n'));
obErrorLog.ThrowError(__FUNCTION__,
auditMsg,
obAuditMsg);
string indexname = pConv->GetInFilename();
string::size_type pos=indexname.find_last_of('.');
if(pos!=string::npos)
{
indexname.erase(pos);
indexname += ".fs";
}
ifstream ifs;
stringstream errorMsg;
if(!indexname.empty())
ifs.open(indexname.c_str(),ios::binary);
if(!ifs)
{
errorMsg << "Couldn't open " << indexname << endl;
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
return false;
}
string datafilename = fs.ReadIndex(&ifs);
if(datafilename.empty())
{
errorMsg << "Difficulty reading from index " << indexname << endl;
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
return false;
}
vector<OBMol> patternMols;
if(!ObtainTarget(pConv, patternMols, indexname))
return false;
bool exactmatch = pConv->IsOption("e", OBConversion::INOPTIONS) != nullptr;
string path;
pos = indexname.find_last_of("/\\");
if(pos==string::npos)
path = datafilename;
else
path = indexname.substr(0,pos+1) + datafilename;
ifstream datastream(path.c_str());
if(!datastream)
{
errorMsg << "Difficulty opening " << path << endl;
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
return false;
}
pConv->SetInStream(&datastream);
bool isgzip = false;
if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str(), isgzip), pConv->GetOutFormat()))
return false;
if (isgzip)
{
errorMsg << "Index datafile must not be in gzip format: " << path << endl;
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
return false;
}
vector<OBBase*> extraSMARTSMols;
vector<OBMol>extraUnchargedMols;
for(unsigned i=0;i<patternMols.size();++i)
{
if(patternMols[i].ConvertDativeBonds())
extraSMARTSMols.push_back(&patternMols[i]);
else
{
extraUnchargedMols.push_back(patternMols[i]);
if(extraUnchargedMols.back().MakeDativeBonds())
extraSMARTSMols.push_back(&extraUnchargedMols.back());
}
}
OBOp* sFilter = OBOp::FindType("s");
if(sFilter)
sFilter->ProcessVec(extraSMARTSMols);
const char* p = pConv->IsOption("t",OBConversion::INOPTIONS);
if(p)
{
multimap<double, unsigned long> SeekposMap;
string txt=p;
if(txt.find('.')==string::npos)
{
int n = atoi(p);
fs.FindSimilar(&patternMols[0], SeekposMap, n);
}
else
{
double MaxTani = 1.1;
size_t pos = txt.find(',');
if( pos != string::npos ) {
MaxTani = atof( txt.substr( pos + 1 ).c_str() );
}
double MinTani = atof( txt.substr( 0, pos ).c_str() );
fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani);
}
pConv->RemoveOption("s", OBConversion::GENOPTIONS);
pConv->RemoveOption("S", OBConversion::GENOPTIONS);
multimap<double, unsigned long>::reverse_iterator itr;
for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
{
datastream.seekg(itr->second);
if(pConv->IsOption("a", OBConversion::INOPTIONS))
{
pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS);
stringstream ss;
ss << " " << itr->first;
pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str());
}
pConv->SetOneObjectOnly();
if(itr != --SeekposMap.rend())
pConv->SetMoreFilesToCome(); pConv->Convert(nullptr, nullptr);
}
}
else
{
int MaxCandidates = 4000;
p = pConv->IsOption("l",OBConversion::INOPTIONS);
if(p && atoi(p))
MaxCandidates = atoi(p);
vector<unsigned long> SeekPositions;
if(exactmatch)
{
fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates);
stringstream ss;
ss << patternMols[0].NumHvyAtoms();
pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str());
}
else
{
vector<OBMol>::iterator iter;
for(iter=patternMols.begin();iter!=patternMols.end();++iter)
fs.Find(&*iter, SeekPositions, MaxCandidates);
clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl;
}
vector<unsigned long>::iterator seekitr,
begin = SeekPositions.begin(), end = SeekPositions.end();
if(patternMols.size()>1) {
sort(begin, end);
end = unique(begin, end); }
if(pConv->IsOption("n", OBConversion::INOPTIONS) )
pConv->RemoveOption("s",OBConversion::GENOPTIONS);
pConv->SetLast(false);
for(seekitr=begin; seekitr!=end; ++seekitr)
{
datastream.seekg(*seekitr);
if(!pConv->GetInFormat()->ReadChemObject(pConv))
return false;
pConv->SetFirstInput(false); }
}
return false; }
bool FastSearchFormat::WriteChemObject(OBConversion* pConv)
{
bool update = pConv->IsOption("u") != nullptr;
static ostream* pOs;
static bool NewOstreamUsed;
if (fsi == nullptr)
{
if(pConv->GetInGzipped())
{
obErrorLog.ThrowError(__FUNCTION__,
"Fastindex search requires an uncompressed input file so it can quickly seek to a record.",
obWarning);
}
pOs = pConv->GetOutStream(); NewOstreamUsed=false;
string mes("prepare an");
if(update)
mes = "update the";
clog << "This will " << mes << " index of " << pConv->GetInFilename()
<< " and may take some time..." << flush;
if(!pConv->IsLastFile())
{
obErrorLog.ThrowError(__FUNCTION__,
"There should not be multiple input files. A .fs file is an index of a single datafile.",
obError);
return false;
}
std::string auditMsg = "OpenBabel::Write fastsearch index ";
std::string description(Description());
auditMsg += description.substr( 0, description.find('\n') );
obErrorLog.ThrowError(__FUNCTION__,auditMsg,obAuditMsg);
FptIndex* pidx = nullptr;
if(!dynamic_cast<ofstream*>(pOs))
{
string indexname=pConv->GetInFilename();
string::size_type pos=indexname.find_last_of('.');
if(pos!=string::npos)
indexname.erase(pos);
indexname += ".fs";
bool idxok=true;
if(update)
{
LastSeekpos = 0;
idxok=false;
ifstream ifs(indexname.c_str(),ifstream::binary);
if(ifs.good())
{
pidx = new FptIndex;
idxok = pidx->Read(&ifs);
}
}
pOs = new ofstream(indexname.c_str(),ofstream::binary);
if(!pOs->good() || !idxok)
{
stringstream errorMsg;
errorMsg << "Trouble opening or reading " << indexname << endl;
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
static_cast<ofstream *>(pOs)->close(); delete pOs;
delete pidx; return false;
}
NewOstreamUsed=true;
}
else {
if(update)
{
obErrorLog.ThrowError(__FUNCTION__,
"Currently, updating is only done on index files that"
"have the same name as the datafile.\n"
"Do not specify an output file; use the form:\n"
" obabel datafile.xxx -ofs -xu", obError);
return false;
}
}
int nbits = 0;
const char* p = pConv->IsOption("N");
if(p)
nbits = atoi(p);
string fpid; p=pConv->IsOption("f");
if(p)
fpid=p;
string datafilename = pConv->GetInFilename();
if(datafilename.empty())
{
obErrorLog.ThrowError(__FUNCTION__, "No datafile!", obError);
delete pidx;
return false;
}
string::size_type pos = datafilename.find_last_of("/\\");
if(pos!=string::npos)
datafilename=datafilename.substr(pos+1);
nmols = pConv->NumInputObjects();
if(nmols>0)
clog << "\nIt contains " << nmols << " molecules" << flush;
if(nmols>500000)
{
istream* is = pConv->GetInStream();
streampos origpos = is->tellg();
is->seekg(0,ios_base::end);
long long filesize = is->tellg();
if(sizeof(void*) < 8 && filesize > 4294967295u)
{
obErrorLog.ThrowError(__FUNCTION__, "The datafile must not be larger than 4GB", obError);
return false;
}
is->seekg(origpos);
}
sw.Start();
if(update)
{
fsi = new FastSearchIndexer(pidx, pOs, nmols);
LastSeekpos = *(pidx->seekdata.end()-1);
pConv->GetInStream()->seekg(LastSeekpos);
}
else
fsi = new FastSearchIndexer(datafilename, pOs, fpid, nbits, nmols);
obErrorLog.StopLogging();
}
OBBase* pOb = pConv->GetChemObject();
OBMol* pmol = dynamic_cast<OBMol*> (pOb);
if(pmol)
pmol->ConvertDativeBonds();
streampos seekpos = pConv->GetInPos();
if(!update || seekpos>LastSeekpos)
{
fsi->Add(pOb, seekpos );
if(pConv->GetOutputIndex()==400 && nmols>1000)
{
clog << " Estimated completion time ";
double secs = sw.Elapsed() * nmols / 400; if(secs>150)
clog << secs/60 << " minutes" << endl;
else
clog << secs << " seconds" << endl;
}
}
else
pConv->SetOutputIndex(pConv->GetOutputIndex()-1);
if(pConv->IsLast())
{
delete fsi; if(NewOstreamUsed)
delete pOs;
fsi=nullptr;
obErrorLog.StartLogging();
double secs = sw.Elapsed();
if(secs>150)
clog << "\n It took " << secs/60 << " minutes" << endl;
else
clog << "\n It took " << secs << " seconds" << endl;
}
delete pOb;
return true;
}
bool FastSearchFormat::ObtainTarget(OBConversion* pConv, vector<OBMol>& patternMols, const string& indexname)
{
OBMol patternMol;
patternMol.SetIsPatternStructure();
const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS);
bool OldSOption=false;
if(!p)
{
p = pConv->IsOption("S",OBConversion::GENOPTIONS);
if(!p)
p = pConv->IsOption("S",OBConversion::INOPTIONS); OldSOption = true;
}
if(p)
{
vector<string> vec;
tokenize(vec, p);
if(vec.size() == 0)
{
obErrorLog.ThrowError(__FUNCTION__,
"Missing argument for -s/-S", obError);
return false;
}
if(vec[0][0]=='~')
vec[0].erase(0,1);
if(vec.size()>1 && vec[1]=="exact")
pConv->AddOption("e", OBConversion::INOPTIONS);
OBConversion patternConv;
OBFormat* pFormat;
string& txt =vec [0];
if( txt.empty() ||
txt.find('.')==string::npos ||
!(pFormat = patternConv.FormatFromExt(txt.c_str())) ||
!patternConv.SetInFormat(pFormat) ||
!patternConv.ReadFile(&patternMol, txt) ||
patternMol.NumAtoms()==0)
{
for(;;)
{
string::size_type pos1, pos2;
pos1 = txt.find("[#");
if(pos1==string::npos)
break;
pos2 = txt.find(']');
int atno;
if(pos2!=string::npos && (atno = atoi(txt.substr(pos1+2, pos2-pos1-2).c_str())) && atno>0)
txt.replace(pos1, pos2-pos1+1, OBElements::GetSymbol(atno));
else
{
obErrorLog.ThrowError(__FUNCTION__,"Ill-formed [#n] atom in SMARTS", obError);
return false;
}
}
bool hasTildeBond;
if( (hasTildeBond = (txt.find('~')!=string::npos)) ) {
if(txt.find('$')!=string::npos)
{
obErrorLog.ThrowError(__FUNCTION__,
"Cannot use ~ bonds in patterns with $ (quadruple) bonds.)", obError);
return false;
}
replace(txt.begin(),txt.end(), '~' , '$');
}
patternConv.SetInFormat("smi");
if(!patternConv.ReadString(&patternMol, vec[0]))
{
obErrorLog.ThrowError(__FUNCTION__,"Cannot read the SMILES string",obError);
return false;
}
if(hasTildeBond)
{
AddPattern(patternMols, patternMol, 0); return true;
}
}
else
{
patternMols.push_back(patternMol);
while(patternConv.Read(&patternMol))
patternMols.push_back(patternMol);
return true;
}
}
if(OldSOption) {
OBConversion conv;
if(conv.SetOutFormat("smi"))
{
string optiontext = conv.WriteString(&patternMol, true);
pConv->AddOption("s", OBConversion::GENOPTIONS, optiontext.c_str());
}
}
if(!p)
{
const FptIndexHeader& header = fs.GetIndexHeader();
string id(header.fpid);
if(id.empty())
id = "default";
clog << indexname << " is an index of\n " << header.datafilename
<< ".\n It contains " << header.nEntries
<< " molecules. The fingerprint type is " << id << " with "
<< OBFingerprint::Getbitsperint() * header.words << " bits.\n"
<< "Typical usage for a substructure search:\n"
<< "obabel indexfile.fs -osmi -sSMILES\n"
<< "(-s option in GUI is 'Convert only if match SMARTS or mols in file')" << endl;
return false;
}
patternMols.push_back(patternMol);
return true;
}
void FastSearchFormat::AddPattern(vector<OBMol>& patternMols, OBMol patternMol, int idx)
{
if(idx>=patternMol.NumBonds())
return;
if(patternMol.GetBond(idx)->GetBondOrder()==4)
{
patternMol.GetBond(idx)->SetBondOrder(1);
patternMols.push_back(patternMol);
AddPattern(patternMols, patternMol,idx+1);
patternMols.push_back(patternMol);
patternMols.back().GetBond(idx)->SetBondOrder(5);
}
AddPattern(patternMols, patternMol,idx+1);
}
}