#ifdef MAKE_ZIM_SUPPORT
#include "zim.hh"
#include "btreeidx.hh"
#include "fsencoding.hh"
#include "folding.hh"
#include "categorized_logging.hh"
#include "gddebug.hh"
#include "utf8.hh"
#include "decompress.hh"
#include "langcoder.hh"
#include "wstring_qt.hh"
#include "filetype.hh"
#include "file.hh"
#include "qt4x5.hh"
#include "tiff.hh"
#include "ftshelpers.hh"
#include "htmlescape.hh"
#include "splitfile.hh"
#ifdef _MSC_VER
#include <stub_msvc.h>
#endif
#include <QByteArray>
#include <QFile>
#include <QFileInfo>
#include <QString>
#include <QRunnable>
#include <QSemaphore>
#include <QAtomicInt>
#include <QImage>
#include <QDir>
#include <QDebug>
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
#include <QRegularExpression>
#endif
#include <string>
#include <set>
#include <map>
#include <algorithm>
namespace Zim {
// Number of slots in ZimFile's cluster cache (see ZimFile::cache).
#define CACHE_SIZE 3
// Names pulled in from the standard library and from project namespaces.
using std::string;
using std::map;
using std::vector;
using std::multimap;
using std::pair;
using std::set;
using gd::wstring;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
// Exception types of this module, declared through the project's DEF_EX
// macros with Dictionary::Ex passed as the last argument.
DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex )
// Thrown by ZimFile::open() when the cluster pointer table cannot be read.
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
DEF_EX_STR( exInvalidZimHeader, "Invalid Zim header", Dictionary::Ex )
// Thrown inside ZimDictionary::makeFTSIndex() when cancellation is requested.
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
// Forward declaration; the class is defined further below.
class ZimFile;
#pragma pack( push, 1 )
/// Compression codes; ZimFile::getClusterData() compares them with the low
/// four bits of the first byte of a cluster.
enum CompressionType
{
  Default = 0,
  None    = 1,
  Zlib    = 2,
  Bzip2   = 3,
  Lzma2   = 4,
  Zstd    = 5
};
// Header record of a ZIM file. ZimFile::open() fills it with a single raw
// read of sizeof( ZIM_header ) bytes, so the field order and the packing
// must not be changed.
struct ZIM_header
{
// open() rejects the file unless this equals 0x44D495A
quint32 magicNumber;
// a value >= 6 allows clusters to use 8-byte blob offsets (see getClusterData)
quint16 majorVersion;
quint16 minorVersion;
quint8 uuid[ 16 ];
// article numbers >= articleCount are rejected by readArticle()/getArticleCluster()
quint32 articleCount;
// number of quint64 entries in the table located at clusterPtrPos
quint32 clusterCount;
// file position of a table of 8-byte entry positions, indexed by article number
quint64 urlPtrPos;
quint64 titlePtrPos;
// file position of the table of cluster offsets
quint64 clusterPtrPos;
// file position of the NUL-terminated MIME type strings; open() requires it
// to be equal to sizeof( ZIM_header )
quint64 mimeListPos;
quint32 mainPage;
quint32 layoutPage;
quint64 checksumPos;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
// Fixed-size start of a directory entry that carries article data.
// Readers in this file fetch the 2-byte mimetype first and then read the
// remaining sizeof( ArticleEntry ) - 2 bytes straight into the struct at
// offset 2, so the layout must stay packed and in this order.
struct ArticleEntry
{
// a value of 0xFFFF means the entry is a redirect (see RedirectEntry)
quint16 mimetype;
quint8 parameterLen;
char nameSpace;
quint32 revision;
// cluster passed to ZimFile::getClusterData()
quint32 clusterNumber;
// index of the blob inside that cluster's offset table
quint32 blobNumber;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
// Fixed-size start of a directory entry whose mimetype is 0xFFFF.
// As with ArticleEntry, the bytes after the mimetype are read directly into
// the struct at offset 2, so the layout must stay packed and in this order.
struct RedirectEntry
{
quint16 mimetype;
quint8 parameterLen;
char nameSpace;
quint32 revision;
// article number the readers continue with
quint32 redirectIndex;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
/// Values that indexIsOldOrBad() expects to find in an index file header.
enum
{
  Signature = 0x584D495A,
  CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version
};
/// Header record of the index file. It is read as one raw record
/// (see indexIsOldOrBad() and the ZimDictionary constructor), so the field
/// order and the packing are part of the file format.
struct IdxHeader
{
  quint32 signature;                      // compared with Signature
  quint32 formatVersion;                  // compared with CurrentFormatVersion
  quint32 indexBtreeMaxElements;          // main index parameters
  quint32 indexRootOffset;
  quint32 resourceIndexBtreeMaxElements;  // resource index parameters
  quint32 resourceIndexRootOffset;
  quint32 wordCount;
  quint32 articleCount;
  quint32 namePtr;                        // 0xFFFFFFFF when absent
  quint32 descriptionPtr;                 // 0xFFFFFFFF when absent
  quint32 langFrom;
  quint32 langTo;
  quint32 iconPtr;                        // 0xFFFFFFFF when absent
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
#pragma pack( pop )
/// One slot of ZimFile's cluster cache.
struct Cache
{
  char * data;                 // malloc()ed buffer, 0 when the slot is empty
  quint32 clusterNumber;       // cluster whose data is stored in the slot
  int stamp;                   // last-use stamp, -1 for a slot never used
  int count;                   // number of valid bytes in data
  int size;                    // allocated size of data
  unsigned blobs_offset_size;  // offset width reported for the cached cluster

  Cache():
    data( 0 ), clusterNumber( 0 ), stamp( -1 ),
    count( 0 ), size( 0 ), blobs_offset_size( 0 )
  {}
};
/// A ZIM data file (possibly split into several parts) together with its
/// header, MIME type table, sorted cluster offsets and a small cache of
/// decompressed clusters.
class ZimFile : public SplitFile::SplitFile
{
public:

  ZimFile();
  ZimFile( const QString & name );
  ~ZimFile();

  /// Resets the object and attaches the file @p name (plus further parts).
  virtual void setFileName( const QString & name );

  /// Opens the file read-only and loads header, cluster table and MIME list.
  bool open();

  /// Closes the underlying file(s) and releases the cluster cache.
  void close()
  {
    SplitFile::close();
    clearCache();
  }

  const ZIM_header & header() const
  {
    return zimHeader;
  }

  /// Returns the decompressed contents of cluster @p cluster_nom and stores
  /// the width of its blob offsets in @p blob_offset_size.
  string getClusterData( quint32 cluster_nom, unsigned & blob_offset_size );

  /// MIME type string with index @p nom from the table read by open().
  const QString getMimeType( quint16 nom )
  {
    return mimeTypes.value( nom );
  }

  /// True when the MIME type starts with "text/html" or "text/plain".
  bool isArticleMime( quint16 mime_type )
  {
    if( getMimeType( mime_type ).startsWith( "text/html", Qt::CaseInsensitive ) )
      return true;

    return getMimeType( mime_type ).startsWith( "text/plain", Qt::CaseInsensitive );
  }

  /// Follows a redirect chain and returns the MIME type index it ends at.
  quint16 redirectedMimeType( RedirectEntry const & redEntry );

private:

  ZIM_header zimHeader;
  Cache cache[ CACHE_SIZE ];
  int stamp;
  QVector< QPair< quint64, quint32 > > clusterOffsets;
  QStringList mimeTypes;

  void clearCache();
};
/// Creates an object with no file attached and a zeroed header.
ZimFile::ZimFile():
  stamp( 0 )
{
  memset( &zimHeader, 0, sizeof( ZIM_header ) );
}
/// Creates an object attached to @p name. Header, cache and stamp are reset
/// by setFileName().
ZimFile::ZimFile( const QString & name )
{
  this->setFileName( name );
}
/// Frees the buffers held by the cluster cache.
ZimFile::~ZimFile()
{
  this->clearCache();
}
/// Closes whatever is currently attached, resets header and cache, and
/// attaches the file @p name.
///
/// When @p name ends with ".zimaa" the file is treated as the first part of a
/// split archive: the following parts ("ab", "ac", ..., "az", "ba", "bb", ...)
/// are appended in order for as long as they exist on disk.
void ZimFile::setFileName( const QString & name )
{
  close();
  memset( &zimHeader, 0, sizeof( zimHeader ) );
  clearCache();

  appendFile( name );

  if( !name.endsWith( ".zimaa", Qt::CaseInsensitive ) )
    return;

  QString fname = name;
  const int lastPos = fname.size() - 1;

  for( int i = 0; i < 26; i++ )
  {
    fname[ lastPos - 1 ] = (char)( 'a' + i );

    // Part "aa" is the file appended above, so the first round starts at
    // "ab"; every later round has to start at its own "?a" part, otherwise
    // "ba", "ca", ... would be left out of the sequence.
    int j;
    for( j = ( i == 0 ) ? 1 : 0; j < 26; j++ )
    {
      fname[ lastPos ] = (char)( 'a' + j );
      if( !QFileInfo( fname ).isFile() )
        break;

      appendFile( fname );
    }

    // The first missing part ends the sequence
    if( j < 26 )
      break;
  }
}
void ZimFile::clearCache()
{
for( int i = 0; i < CACHE_SIZE; i++ )
{
if( cache[ i ].data )
{
free( cache[ i ].data );
cache[ i ].data = 0;
}
cache[ i ].clusterNumber = 0;
cache[ i ].stamp = -1;
cache[ i ].count = 0;
cache[ i ].size = 0;
}
stamp = 0;
}
/// Opens the attached file(s) read-only and loads the header, the cluster
/// offset table and the MIME type list.
///
/// @return false when the file cannot be opened, the header cannot be read
///         completely, or the magic number / MIME list position do not match.
/// @throws exCantReadFile when the cluster pointer table cannot be read.
bool ZimFile::open()
{
  if( !SplitFile::open( QIODevice::ReadOnly ) )
    return false;

  memset( &zimHeader, 0, sizeof( zimHeader ) );

  // The object can be re-targeted with setFileName(); without this the MIME
  // types of the previous file would stay at the low indices of the list.
  mimeTypes.clear();

  if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) != sizeof( zimHeader ) )
    return false;

  if( zimHeader.magicNumber != 0x44D495A || zimHeader.mimeListPos != sizeof( zimHeader ) )
    return false;

  // Read the cluster offsets and pair every offset with its cluster number.
  // The pairs are sorted by offset so that getClusterData() can take the
  // distance to the next offset as the cluster size.
  clusterOffsets.resize( zimHeader.clusterCount );
  QVector< quint64 > offs;
  offs.resize( zimHeader.clusterCount );

  seek( zimHeader.clusterPtrPos );

  // Widen before multiplying so the byte count cannot wrap in 32 bits
  qint64 size = static_cast< qint64 >( zimHeader.clusterCount )
                * static_cast< qint64 >( sizeof( quint64 ) );
  if( read( reinterpret_cast< char * >( offs.data() ), size) != size )
  {
    vector< string > names;
    getFilenames( names );
    throw exCantReadFile( names[ 0 ] );
  }

  for( quint32 i = 0; i < zimHeader.clusterCount; i++ )
    clusterOffsets[ i ] = QPair< quint64, quint32 >( offs.at( i ), i );

  std::sort( clusterOffsets.begin(), clusterOffsets.end() );

  // Read the MIME types: NUL-terminated strings, the list ends at the first
  // empty string (or when no more characters can be read)
  string type;
  char ch;

  seek( zimHeader.mimeListPos );

  for( ; ; )
  {
    type.clear();
    while( getChar( &ch ) )
    {
      if( ch == 0 )
        break;
      type.push_back( ch );
    }

    if( type.empty() )
      break;

    QString s = QString::fromUtf8( type.c_str(), type.size() );
    mimeTypes.append( s );
  }

  return true;
}
/// Returns the decompressed contents of cluster @p cluster_nom.
///
/// @param cluster_nom        number of the cluster to load
/// @param blobs_offset_size  receives the width in bytes (4 or 8) of the
///                           entries of the cluster's blob offset table
/// @return the decompressed data, or an empty string when the cluster number
///         is unknown, the cluster cannot be read or decompressed, its
///         compression type is not recognised, or the result is too short to
///         hold even the first offset.
///
/// Clusters holding more than one blob are kept in a small cache; the slot
/// with the lowest stamp is reused on a miss.
string ZimFile::getClusterData( quint32 cluster_nom, unsigned & blobs_offset_size )
{
  // Look the cluster up in the cache; while scanning, remember the slot with
  // the smallest stamp as replacement candidate
  int target = 0;
  bool found = false;
  int lastStamp = INT_MAX;

  for( int i = 0; i < CACHE_SIZE; i++ )
  {
    if( cache[ i ].clusterNumber == cluster_nom && cache[ i ].count )
    {
      found = true;
      target = i;
      break;
    }

    if( cache[ i ].stamp < lastStamp )
    {
      lastStamp = cache[ i ].stamp;
      target = i;
    }
  }

  cache[ target ].stamp = ++stamp;
  if( stamp < 0 )
  {
    // Counter wrapped around: restart the stamps
    stamp = 0;
    for (int i = 0; i < CACHE_SIZE; i++)
      cache[ i ].stamp = -1;
  }

  if( found )
  {
    // Cache hit
    blobs_offset_size = cache[ target ].blobs_offset_size;
    return string( cache[ target ].data, cache[ target ].count );
  }

  // Cache miss: find the cluster in the offset-sorted table
  quint64 clusterSize;
  quint32 nom;
  for( nom = 0; nom < zimHeader.clusterCount; nom++ )
    if( clusterOffsets.at( nom ).second == cluster_nom )
      break;

  if( nom >= zimHeader.clusterCount ) // unknown cluster number
    return string();

  // The cluster extends to the next offset, or to the end of file for the
  // last one
  if( nom < zimHeader.clusterCount - 1 )
    clusterSize = clusterOffsets.at( nom + 1 ).first - clusterOffsets.at( nom ).first;
  else
    clusterSize = size() - clusterOffsets.at( nom ).first;

  seek( clusterOffsets.at( nom ).first );

  // First byte: low four bits give the compression type, bit 0x10 requests
  // 8-byte offsets (honoured only for majorVersion >= 6)
  char compressionType, cluster_info;
  if( !getChar( &cluster_info ) )
    return string();

  compressionType = cluster_info & 0x0F;
  blobs_offset_size = cluster_info & 0x10 && zimHeader.majorVersion >= 6 ? 8 : 4;

  string decompressedData;

  // NOTE(review): clusterSize still includes the info byte consumed above,
  // so this asks for one byte more than the cluster payload - confirm that
  // the decompressors tolerate the trailing byte.
  QByteArray data = read( clusterSize );

  if( compressionType == Default || compressionType == None )
    decompressedData = string( data.data(), data.size() );
  else if( compressionType == Zlib )
    decompressedData = decompressZlib( data.constData(), data.size() );
  else if( compressionType == Bzip2 )
    decompressedData = decompressBzip2( data.constData(), data.size() );
  else if( compressionType == Lzma2 )
    decompressedData = decompressLzma2( data.constData(), data.size() );
  else if( compressionType == Zstd )
    decompressedData = decompressZstd( data.constData(), data.size() );
  else
    return string();

  // The data must at least hold the first offset; this also covers the
  // "nothing decompressed" case and keeps the memcpy below inside the buffer
  if( decompressedData.size() < blobs_offset_size )
    return string();

  // Derive the number of blobs from the first offset; only clusters with
  // more than one blob are cached
  quint32 firstOffset32;
  quint64 firstOffset;

  if( blobs_offset_size == 8 )
    memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
  else
  {
    memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) );
    firstOffset = firstOffset32;
  }

  // A first offset smaller than one table entry is malformed; count it as
  // "no blobs" instead of letting the subtraction wrap around
  quint32 blobCount = 0;
  if( firstOffset >= blobs_offset_size )
    blobCount = ( firstOffset - blobs_offset_size ) / blobs_offset_size;

  if( blobCount > 1 )
  {
    // Fill the cache slot, allocating a new buffer when needed
    int size = decompressedData.size();
    if( cache[ target ].count < size )
    {
      if( cache[ target ].data )
        free( cache[ target ].data );
      cache[ target ].data = ( char * )malloc( size );
      if( cache[ target ].data )
        cache[ target ].size = size;
      else
      {
        cache[ target ].size = 0;
        cache[ target ].count = 0;
      }
    }

    if( cache[ target ].size )
    {
      memcpy( cache[ target ].data, decompressedData.c_str(), size );
      cache[ target ].count = size;
      cache[ target ].clusterNumber = cluster_nom;
      cache[ target ].blobs_offset_size = blobs_offset_size;
    }
  }

  return decompressedData;
}
/// Follows the redirect chain that starts at @p redEntry and returns the
/// MIME type index of the first non-redirect entry reached.
///
/// The current file position is saved on entry and restored before
/// returning.
///
/// @return the MIME type index, or 0xFFFF when a read fails or an entry
///         redirects to itself.
quint16 ZimFile::redirectedMimeType( RedirectEntry const & redEntry )
{
  RedirectEntry current_entry = redEntry;
  quint64 current_pos = pos();
  quint16 mimetype = 0xFFFF;

  for( ; ; )
  {
    quint32 current_nom = current_entry.redirectIndex;

    // Fetch the position of the target entry from the 8-byte pointer table
    seek( zimHeader.urlPtrPos + (quint64)current_nom * 8 );
    quint64 new_pos;
    if( read( reinterpret_cast< char * >( &new_pos ), sizeof(new_pos) ) != sizeof(new_pos) )
      break;

    seek( new_pos );
    quint16 new_mimetype;
    if( read( reinterpret_cast< char * >( &new_mimetype ), sizeof(new_mimetype) ) != sizeof(new_mimetype) )
      break;

    if( new_mimetype == 0xFFFF )
    {
      // Another redirect: load the rest of the entry (the mimetype field was
      // consumed above) and keep following
      if( read( reinterpret_cast< char * >( &current_entry ) + 2, sizeof( current_entry ) - 2 ) != sizeof( current_entry ) - 2 )
        break;

      // An entry that redirects to itself would never terminate
      if( current_nom == current_entry.redirectIndex )
        break;
    }
    else
    {
      mimetype = new_mimetype;
      break;
    }
  }

  seek( current_pos );
  return mimetype;
}
/// Tells whether the index file @p indexFile must be rebuilt: its header
/// cannot be read, or its signature or format version differ from the
/// values expected by this build.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  if( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if( header.signature != Signature )
    return true;

  return header.formatVersion != CurrentFormatVersion;
}
/// Returns the cluster number of article @p articleNumber, following
/// redirect entries on the way.
///
/// @return the cluster number, or 0xFFFFFFFF when the article number is out
///         of range, a read fails, or an entry redirects to itself.
quint32 getArticleCluster( ZimFile & file, quint32 articleNumber )
{
  const quint32 noCluster = 0xFFFFFFFF;
  ZIM_header const & header = file.header();

  for( ; ; )
  {
    if( articleNumber >= header.articleCount )
      return noCluster;

    // Position of the directory entry, taken from the 8-byte pointer table
    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 entryPos;
    if( file.read( reinterpret_cast< char * >( &entryPos ), sizeof(entryPos) ) != sizeof(entryPos) )
      return noCluster;

    file.seek( entryPos );
    quint16 mimetype;
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) )
      return noCluster;

    if( mimetype != 0xFFFF )
    {
      // Regular entry: read the rest of it and report its cluster
      ArticleEntry artEntry;
      artEntry.mimetype = mimetype;
      if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 )
        return noCluster;

      return artEntry.clusterNumber;
    }

    // Redirect entry: continue with its target
    RedirectEntry redEntry;
    if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 )
      return noCluster;

    if( articleNumber == redEntry.redirectIndex )
      return noCluster;

    articleNumber = redEntry.redirectIndex;
  }
}
/// Reads the blob of article @p articleNumber into @p result, following
/// redirect entries on the way.
///
/// @param file            ZIM file to read from
/// @param articleNumber   number of the article to load
/// @param result          receives the blob; cleared first
/// @param loadedArticles  optional set of already loaded article numbers; a
///                        (redirect-resolved) article found in it is not
///                        loaded again
/// @return the number of the article actually read (after redirects), or
///         0xFFFFFFFF when nothing was read: number out of range, read
///         error, self-redirect, article already loaded, or cluster data
///         that is missing or inconsistent.
quint32 readArticle( ZimFile & file, quint32 articleNumber, string & result,
                     set< quint32 > * loadedArticles = NULL )
{
  result.clear();

  while( 1 )
  {
    ZIM_header const & header = file.header();
    if( articleNumber >= header.articleCount )
      break;

    // Position of the directory entry, taken from the 8-byte pointer table
    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 pos;
    if( file.read( reinterpret_cast< char * >( &pos ), sizeof(pos) ) != sizeof(pos) )
      break;

    quint16 mimetype;
    file.seek( pos );
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) )
      break;

    if( mimetype == 0xFFFF )
    {
      // Redirect entry: continue with its target
      RedirectEntry redEntry;
      if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 )
        break;
      if( articleNumber == redEntry.redirectIndex )
        break;
      articleNumber = redEntry.redirectIndex;
      continue;
    }

    // Skip articles the caller has loaded already
    if( loadedArticles && loadedArticles->find( articleNumber ) != loadedArticles->end() )
      break;

    ArticleEntry artEntry;
    artEntry.mimetype = mimetype;
    if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 )
      break;

    unsigned offset_size = 0;
    string decompressedData = file.getClusterData( artEntry.clusterNumber, offset_size );
    if( decompressedData.empty() )
      break;

    // Only 4- and 8-byte offset tables are handled, and the data has to be
    // long enough to hold the first table entry
    if( ( offset_size != 4 && offset_size != 8 ) || decompressedData.size() < offset_size )
      break;

    quint64 firstOffset;
    if( offset_size == 8 )
      memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
    else
    {
      quint32 firstOffset32;
      memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) );
      firstOffset = firstOffset32;
    }

    if( firstOffset < offset_size )
      break;

    // The table holds one entry per blob plus a final end offset, so valid
    // blob numbers are 0 .. blobCount - 1
    const quint64 blobCount = ( firstOffset - offset_size ) / offset_size;
    if( artEntry.blobNumber >= blobCount )
      break;

    // Both table entries of the blob must lie inside the data
    const quint64 tablePos = (quint64)artEntry.blobNumber * offset_size;
    if( tablePos + 2 * (quint64)offset_size > decompressedData.size() )
      break;

    quint64 blobStart, blobEnd;
    if( offset_size == 8 )
    {
      quint64 offsets[ 2 ];
      memcpy( offsets, decompressedData.data() + tablePos, sizeof(offsets) );
      blobStart = offsets[ 0 ];
      blobEnd = offsets[ 1 ];
    }
    else
    {
      quint32 offsets[ 2 ];
      memcpy( offsets, decompressedData.data() + tablePos, sizeof(offsets) );
      blobStart = offsets[ 0 ];
      blobEnd = offsets[ 1 ];
    }

    // Reject reversed offsets and a start beyond the data (string::append
    // would throw std::out_of_range for the latter); an end beyond the data
    // is clamped by append()
    if( blobStart > blobEnd || blobStart > decompressedData.size() )
      break;

    result.append( decompressedData, blobStart, blobEnd - blobStart );

    return articleNumber;
  }

  return 0xFFFFFFFF;
}
// Dictionary backed by a ZIM file (df) and a separate index file (idx) that
// holds the main word index and a second index for resources.
class ZimDictionary: public BtreeIndexing::BtreeDictionary
{
// How links inside articles map to headwords; determined in convert() the
// first time a link containing '/' is seen.
enum LINKS_TYPE { UNKNOWN, SLASH, NO_SLASH };
// idxMutex is handed to openIndex() for the main index
Mutex idxMutex;
// zimMutex is held around reads from df; idxResourceMutex is handed to
// resourceIndex.openIndex()
Mutex zimMutex, idxResourceMutex;
File::Class idx;
BtreeIndex resourceIndex;
IdxHeader idxHeader;
string dictionaryName;
ZimFile df;
// not referenced in the visible part of this file
set< quint32 > articlesIndexedForFTS;
LINKS_TYPE linksType;
public:
ZimDictionary( string const & id, string const & indexFile,
vector< string > const & dictionaryFiles );
~ZimDictionary();
virtual string getName() throw()
{ return dictionaryName; }
virtual map< Dictionary::Property, string > getProperties() throw()
{ return map< Dictionary::Property, string >(); }
// Counts and languages are reported straight from the index header
virtual unsigned long getArticleCount() throw()
{ return idxHeader.articleCount; }
virtual unsigned long getWordCount() throw()
{ return idxHeader.wordCount; }
inline virtual quint32 getLangFrom() const
{ return idxHeader.langFrom; }
inline virtual quint32 getLangTo() const
{ return idxHeader.langTo; }
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
vector< wstring > const & alts,
wstring const &,
bool ignoreDiacritics )
THROW_SPEC( std::exception );
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
THROW_SPEC( std::exception );
// Loads the description on first use and returns the stored string afterwards
virtual QString const& getDescription();
// Looks resourceName up in the resource index and reads the first hit into data
void loadResource( std::string &resourceName, string & data );
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
int searchMode, bool matchCase,
int distanceBetweenWords,
int maxResults,
bool ignoreWordsOrder,
bool ignoreDiacritics,
QThreadPool * ftsThreadPoolPtr );
// Both overloads load the raw article and HTML-unescape it; the second one
// also returns the number of the article actually read
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
quint32 getArticleText( uint32_t articleAddress, QString & headword, QString & text,
set< quint32 > * loadedArticles );
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
// Full-text search is allowed when it is enabled globally, "ZIM" is not among
// the disabled types, and the article count does not exceed the configured
// limit (0 means no limit)
virtual void setFTSParameters( Config::FullTextSearch const & fts )
{
can_FTS = fts.enabled
&& !fts.disabledTypes.contains( "ZIM", Qt::CaseInsensitive )
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
}
// Reorders offsets by the cluster each article is stored in
virtual void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled );
protected:
virtual void loadIcon() throw();
private:
// Reads the article at address under zimMutex; unless rawText is set the
// text is passed through convert(). Returns what readArticle() returned.
quint32 loadArticle( quint32 address,
string & articleText,
set< quint32 > * loadedArticles,
bool rawText = false );
// Rewrites resource references and links of an article's HTML
string convert( string const & in_data );
friend class ZimArticleRequest;
friend class ZimResourceRequest;
};
/// Opens the index file and the ZIM file, attaches both indexes, determines
/// the dictionary name and checks whether a usable full-text index exists.
///
/// @param id               dictionary id passed on to BtreeDictionary
/// @param indexFile        name of the index file
/// @param dictionaryFiles  dictionary files; element 0 is the ZIM file
ZimDictionary::ZimDictionary( string const & id,
                              string const & indexFile,
                              vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  df( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) ),
  linksType( UNKNOWN )
{
  // Data file
  df.open();

  // Main index and resource index live in the same index file
  IndexInfo const mainIndex( idxHeader.indexBtreeMaxElements,
                             idxHeader.indexRootOffset );
  openIndex( mainIndex, idx, idxMutex );

  IndexInfo const resIndex( idxHeader.resourceIndexBtreeMaxElements,
                            idxHeader.resourceIndexRootOffset );
  resourceIndex.openIndex( resIndex, idx, idxResourceMutex );

  // Dictionary name: stored in the ZIM file when namePtr is set, otherwise
  // the file name without its directory part
  if( idxHeader.namePtr != 0xFFFFFFFF )
  {
    readArticle( df, idxHeader.namePtr, dictionaryName );
  }
  else
  {
    QString path = QDir::fromNativeSeparators( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) );
    int slash = path.lastIndexOf( '/' );
    dictionaryName = string( path.mid( slash + 1 ).toUtf8().constData() );
  }

  // Full-text search
  can_FTS = true;
  ftsIdxName = indexFile + "_FTS";

  bool const ftsIndexUsable =
    !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
    && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );

  if( ftsIndexUsable )
    FTS_index_completed.ref();
}
/// Closes the ZIM data file.
ZimDictionary::~ZimDictionary()
{
  this->df.close();
}
void ZimDictionary::loadIcon() throw()
{
if ( dictionaryIconLoaded )
return;
QString fileName =
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
fileName.chop( 3 );
if( !loadIconFromFile( fileName ) )
{
if( idxHeader.iconPtr != 0xFFFFFFFF )
{
string pngImage;
readArticle( df, idxHeader.iconPtr, pngImage );
QImage img = QImage::fromData( reinterpret_cast< const uchar *>( pngImage.data() ), pngImage.size() );
img.setAlphaChannel( img.createMaskFromColor( QColor( 192, 192, 192 ).rgb(),
Qt::MaskOutColor ) );
dictionaryNativeIcon = dictionaryIcon = QIcon( QPixmap::fromImage( img ) );
if( dictionaryIcon.isNull() )
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_zim.png");
}
else
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_zim.png");
}
dictionaryIconLoaded = true;
}
/// Reads the article at @p address into @p articleText while holding
/// zimMutex; unless @p rawText is set, the text is then run through
/// convert().
///
/// @return the value returned by readArticle().
quint32 ZimDictionary::loadArticle( quint32 address,
                                    string & articleText,
                                    set< quint32 > * loadedArticles,
                                    bool rawText )
{
  quint32 articleNumber;

  {
    Mutex::Lock _( zimMutex );
    articleNumber = readArticle( df, address, articleText, loadedArticles );
  }

  if( rawText )
    return articleNumber;

  articleText = convert( articleText );
  return articleNumber;
}
/// Rewrites the HTML of an article so it can be shown by the application:
/// resource references are pointed at bres://<dictionary id>/, links to
/// English Wikimedia sites and local links become gdlookup://localhost/
/// links, and <br> variants inside such links are normalised to <br/>.
///
/// @param in  article HTML in UTF-8
/// @return the rewritten HTML in UTF-8, with a clearing <br> appended
string ZimDictionary::convert( const string & in )
{
  QString text = QString::fromUtf8( in.c_str() );

#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  // Strip a background / background-color declaration from the <body> tag
  text.replace( QRegularExpression( "<\\s*body\\s+([^>]*)(background(|-color)):([^;\"]*(;|))" ),
                QString( "<body \\1" ) );

  // src of <img> and <script>
  text.replace( QRegularExpression( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
                QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );

  // Put quotes around unquoted href values
  text.replace( QRegularExpression( "href=(\\.\\.|)/([^\\s>]+)" ),
                QString( "href=\"\\1/\\2\"" ) );

  // href of <link>
  text.replace( QRegularExpression( "<\\s*link\\s+([^>]*)href=\"(\\.\\.|)/" ),
                QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() ) );

  // Links to English wiki sites (keys without ':') become local lookups
  QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegularExpression( "<\\s*a\\s+(class=\"external\"\\s+|)href=" + urlWiki ),
                QString( "<a href=\"gdlookup://localhost/\\6\"" ) );
#else
  text.replace( QRegExp( "<\\s*body\\s+([^>]*)(background(|-color)):([^;\"]*(|;))" ),
                QString( "<body \\1" ) );
  text.replace( QRegExp( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
                QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );
  text.replace( QRegExp( "href=(\\.\\.|)/([^\\s>]+)" ), QString( "href=\"\\1/\\2\"" ) );
  text.replace( QRegExp( "<\\s*link\\s+([^>]*)href=\"(\\.\\.|)/" ),
                QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() ) );
  QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegExp( "<\\s*a\\s+(class=\"external\"\\s+|)href=" + urlWiki ),
                QString( "<a href=\"gdlookup://localhost/\\6\"" ) );
#endif

  // <a href="..."> not starting with a protocol, '#', "mailto:" or "tel:":
  // these are turned into local lookups. The loop header differs between the
  // two regular expression engines, the body is shared.
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  QRegularExpression rxLink( "<\\s*a\\s+([^>]*)href=\"(?!(?:\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>" );
  QRegularExpressionMatchIterator it = rxLink.globalMatch( text );
  int pos = 0;
  QString newText;
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();

    newText += text.midRef( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    // Pad the list so that indexes up to 4 are always valid
    QStringList list = match.capturedTexts();
    for( int i = match.lastCapturedIndex() + 1; i < 5; i++ )
      list.append( QString() );
#else
  QRegExp rxLink( "<\\s*a\\s+([^>]*)href=\"(?!(\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>",
                  Qt::CaseSensitive,
                  QRegExp::RegExp2 );
  int pos = 0;
  while( (pos = rxLink.indexIn( text, pos )) >= 0 )
  {
    QStringList list = rxLink.capturedTexts();
#endif
    // Start from list[3]; when list[4] (the title attribute) is present,
    // use the text between its quotes instead
    QString tag = list[3];
    if ( !list[4].isEmpty() )
      tag = list[4].split("\"")[1];

    // Find out once whether headwords keep the part before '/'
    if( linksType == UNKNOWN && tag.indexOf( '/' ) >= 0 )
    {
      QString word = QUrl::fromPercentEncoding( tag.toLatin1() );
      word.remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
           replace( "_", " " );

      vector< WordArticleLink > links;
      links = findArticles( gd::toWString( word ) );

      if( !links.empty() )
      {
        linksType = SLASH;
      }
      else
      {
        word.remove( QRegExp(".*/") );
        links = findArticles( gd::toWString( word ) );
        if( !links.empty() )
        {
          linksType = NO_SLASH;
          links.clear();
        }
      }
    }

    if( linksType == SLASH || linksType == UNKNOWN )
    {
      tag.remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
          replace( "_", "%20" ).
          prepend( "<a href=\"gdlookup://localhost/" ).
          append( "\" " + list[4] + ">" );
    }
    else
    {
      tag.remove( QRegExp(".*/") ).
          remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
          replace( "_", "%20" ).
          prepend( "<a href=\"gdlookup://localhost/" ).
          append( "\" " + list[4] + ">" );
    }
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
    newText += tag;
  }
  if( pos )
  {
    newText += text.midRef( pos );
    text = newText;
  }
  newText.clear();
#else
    text.replace( pos, list[0].length(), tag );
    pos += tag.length() + 1;
  }
#endif

  // Single characters separated by <br>, <br/> or <br\> inside a lookup
  // link: normalise the separators to <br/>
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  QRegularExpression rxBR( "(<a href=\"gdlookup://localhost/[^\"]*\"\\s*[^>]*>)\\s*((\\w\\s*<br(\\\\|/|)>\\s*)+\\w)\\s*</a>",
                           QRegularExpression::UseUnicodePropertiesOption );
  pos = 0;
  // Iterate over the matches of rxBR with this pass's own iterator
  QRegularExpressionMatchIterator it2 = rxBR.globalMatch( text );
  while( it2.hasNext() )
  {
    QRegularExpressionMatch match = it2.next();

    newText += text.midRef( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    QStringList list = match.capturedTexts();
    for( int i = match.lastCapturedIndex() + 1; i < 3; i++ )
      list.append( QString() );

    QString tag = list[2];
    tag.replace( QRegExp( "<br( |)(\\\\|/|)>", Qt::CaseInsensitive ) , "<br/>" ).
        prepend( list[1] ).
        append( "</a>" );

    newText += tag;
  }
  if( pos )
  {
    newText += text.midRef( pos );
    text = newText;
  }
  newText.clear();
#else
  QRegExp rxBR( "(<a href=\"gdlookup://localhost/[^\"]*\"\\s*[^>]*>)\\s*((\\w\\s*<br(\\\\|/|)>\\s*)+\\w)\\s*</a>",
                Qt::CaseSensitive,
                QRegExp::RegExp2 );
  pos = 0;
  while( (pos = rxBR.indexIn( text, pos )) >= 0 )
  {
    QStringList list = rxBR.capturedTexts();
    QString tag = list[2];
    tag.replace( QRegExp( "<br( |)(\\\\|/|)>", Qt::CaseInsensitive ) , "<br/>" ).
        prepend( list[1] ).
        append( "</a>" );
    text.replace( pos, list[0].length(), tag );
    pos += tag.length() + 1;
  }
#endif

  // Appended to every article: a <br> that clears floats
  text += "<br style=\"clear:both;\" />";

  return text.toUtf8().data();
}
/// Looks @p resourceName up in the resource index and reads the first
/// matching entry into @p data. @p data is left untouched when the name is
/// not found.
void ZimDictionary::loadResource( std::string & resourceName, string & data )
{
  vector< WordArticleLink > link = resourceIndex.findArticles( Utf8::decode( resourceName ) );

  if( link.empty() )
    return;

  // df is shared between threads; read under zimMutex
  Mutex::Lock _( zimMutex );
  readArticle( df, link[ 0 ].articleOffset, data );
}
/// Returns the dictionary description. It is read from the ZIM file on the
/// first call that finds the stored description empty; nothing is read when
/// the index header has no description pointer.
QString const& ZimDictionary::getDescription()
{
  bool const alreadyLoaded = !dictionaryDescription.isEmpty();
  if( alreadyLoaded || idxHeader.descriptionPtr == 0xFFFFFFFF )
    return dictionaryDescription;

  string raw;

  {
    Mutex::Lock _( zimMutex );
    readArticle( df, idxHeader.descriptionPtr, raw );
  }

  if( raw.empty() )
    return dictionaryDescription;

  dictionaryDescription = QString::fromUtf8( raw.c_str(), raw.size() );
  return dictionaryDescription;
}
// Builds the full-text search index file of this dictionary.
//
// isCancelled     - polled throughout; a non-zero value aborts the build
//                   through exUserAbort
// firstIteration  - when true, only the "index already usable" check is done
//                   and no index is built
//
// Any std::exception raised during the build (including the abort) is logged
// and the file named by ftsIdxName is removed.
void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
// Mark the index as completed when it neither needs a rebuild nor is old/bad
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
FTS_index_completed.ref();
if( haveFTSIndex() )
return;
if( ensureInitDone().size() )
return;
if( firstIteration )
return;
gdDebug( "Zim: Building the full-text index for dictionary: %s\n",
getName().c_str() );
try
{
Mutex::Lock _( getFtsMutex() );
File::Class ftsIdx( ftsIndexName(), "wb" );
FtsHelpers::FtsIdxHeader ftsIdxHeader;
memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) );
// A zeroed placeholder header is written first; it is overwritten with the
// real values at the end
ftsIdx.write( ftsIdxHeader );
ChunkedStorage::Writer chunks( ftsIdx );
BtreeIndexing::IndexedWords indexedWords;
// Collect the article offsets referenced by the index
QSet< uint32_t > setOfOffsets;
setOfOffsets.reserve( getWordCount() );
findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// Pair every offset with the cluster its article is stored in and sort by
// cluster, so that articles of the same cluster are processed together
QVector< QPair< quint32, uint32_t > > offsetsWithClusters;
offsetsWithClusters.reserve( setOfOffsets.size() );
for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
it != setOfOffsets.constEnd(); ++it )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
Mutex::Lock _( zimMutex );
offsetsWithClusters.append( QPair< uint32_t, quint32 >( getArticleCluster( df, *it ), *it ) );
}
setOfOffsets.clear();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
std::sort( offsetsWithClusters.begin(), offsetsWithClusters.end() );
QVector< uint32_t > offsets;
offsets.resize( offsetsWithClusters.size() );
for( int i = 0; i < offsetsWithClusters.size(); i++ )
offsets[ i ] = offsetsWithClusters.at( i ).second;
// Release the memory of the temporary pair list
offsetsWithClusters.clear();
offsetsWithClusters.squeeze();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// Load every article once and collect its words; indexedArticles holds the
// article numbers already processed so they are not loaded again
QMap< QString, QVector< uint32_t > > ftsWords;
set< quint32 > indexedArticles;
quint32 articleNumber;
for( int i = 0; i < offsets.size(); i++ )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
QString headword, articleStr;
articleNumber = getArticleText( offsets.at( i ), headword, articleStr,
&indexedArticles );
// 0xFFFFFFFF: nothing was loaded for this offset
if( articleNumber == 0xFFFFFFFF )
continue;
indexedArticles.insert( articleNumber );
FtsHelpers::parseArticleForFts( offsets.at( i ), articleStr, ftsWords );
}
indexedArticles.clear();
offsets.clear();
offsets.squeeze();
// For every word, write its list of offsets as one block (count followed by
// the values) and remember the block's position; the remembered pairs are
// moved into indexedWords in batches of BUF_SIZE
# define BUF_SIZE 20000
QVector< QPair< wstring, uint32_t > > wordsWithOffsets;
wordsWithOffsets.reserve( BUF_SIZE );
QMap< QString, QVector< uint32_t > >::iterator it = ftsWords.begin();
while( it != ftsWords.end() )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
uint32_t offset = chunks.startNewBlock();
uint32_t size = it.value().size();
chunks.addToBlock( &size, sizeof(uint32_t) );
chunks.addToBlock( it.value().data(), size * sizeof(uint32_t) );
wordsWithOffsets.append( QPair< wstring, uint32_t >( gd::toWString( it.key() ), offset ) );
// Entries are erased from the map as soon as they have been written
it = ftsWords.erase( it );
if( wordsWithOffsets.size() >= BUF_SIZE )
{
for( int i = 0; i < wordsWithOffsets.size(); i++ )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
indexedWords.addSingleWord( wordsWithOffsets[ i ].first, wordsWithOffsets[ i ].second );
}
wordsWithOffsets.clear();
}
}
ftsWords.clear();
// Move the remaining (incomplete) batch
for( int i = 0; i < wordsWithOffsets.size(); i++ )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
indexedWords.addSingleWord( wordsWithOffsets[ i ].first, wordsWithOffsets[ i ].second );
}
#undef BUF_SIZE
wordsWithOffsets.clear();
wordsWithOffsets.squeeze();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
ftsIdxHeader.chunksOffset = chunks.finish();
ftsIdxHeader.wordCount = indexedWords.size();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// Build the word index and finally rewrite the header with the real values
BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx );
indexedWords.clear();
ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements;
ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset;
ftsIdxHeader.signature = FtsHelpers::FtsSignature;
ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + getFtsIndexVersion();
ftsIdx.rewind();
ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
FTS_index_completed.ref();
}
catch( std::exception &ex )
{
// Log the failure and remove the index file
gdWarning( "Zim: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
}
}
/// Reorders @p offsets in place so that they are sorted by the cluster each
/// article is stored in (and by offset within a cluster). When
/// @p isCancelled becomes non-zero during the cluster lookup, the function
/// returns and @p offsets is left as it was.
void ZimDictionary::sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets,
                                               QAtomicInt & isCancelled )
{
  typedef QPair< quint32, uint32_t > ClusterAndOffset;

  QVector< ClusterAndOffset > byCluster;
  byCluster.reserve( offsets.size() );

  for( int n = 0; n < offsets.size(); n++ )
  {
    if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
      return;

    uint32_t const articleOffset = offsets.at( n );

    Mutex::Lock _( zimMutex );
    byCluster.append( ClusterAndOffset( getArticleCluster( df, articleOffset ), articleOffset ) );
  }

  std::sort( byCluster.begin(), byCluster.end() );

  for( int n = 0; n < byCluster.size(); n++ )
    offsets[ n ] = byCluster.at( n ).second;
}
void ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
{
try
{
headword.clear();
string articleText;
loadArticle( articleAddress, articleText, 0, true );
text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
}
catch( std::exception &ex )
{
gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
}
}
/// Loads the raw text of the article at @p articleAddress, HTML-unescapes it
/// and stores it in @p text. @p headword is cleared.
///
/// @param loadedArticles  passed on to loadArticle()
/// @return the article number reported by loadArticle(), or 0xFFFFFFFF when
///         loading threw an exception (which is logged).
quint32 ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text,
                                       set< quint32 > * loadedArticles )
{
  quint32 loadedNumber = 0xFFFFFFFF;

  try
  {
    headword.clear();

    string rawArticle;
    loadedNumber = loadArticle( articleAddress, rawArticle, loadedArticles, true );

    QString const decoded = QString::fromUtf8( rawArticle.data(), rawArticle.size() );
    text = Html::unescape( decoded );
  }
  catch( std::exception &ex )
  {
    gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
  }

  return loadedNumber;
}
/// Creates a full-text search request for this dictionary; all arguments are
/// handed unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > ZimDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics,
                                                                 QThreadPool * ftsThreadPoolPtr )
{
  return new FtsHelpers::FTSResultsRequest( *this,
                                            searchString,
                                            searchMode,
                                            matchCase,
                                            distanceBetweenWords,
                                            maxResults,
                                            ignoreWordsOrder,
                                            ignoreDiacritics,
                                            ftsThreadPoolPtr );
}
class ZimArticleRequest;

/// Runnable that executes ZimArticleRequest::run() and releases the
/// request's semaphore when the runnable object is destroyed.
class ZimArticleRequestRunnable: public QRunnable
{
  ZimArticleRequest & r;
  QSemaphore & hasExited;

public:

  ZimArticleRequestRunnable( ZimArticleRequest & r_, QSemaphore & hasExited_ ):
    r( r_ ),
    hasExited( hasExited_ )
  {}

  ~ZimArticleRequestRunnable()
  {
    hasExited.release();
  }

  virtual void run();
};
/// Article lookup request. The constructor hands a ZimArticleRequestRunnable
/// to the global thread pool; the destructor requests cancellation and then
/// waits on the semaphore that the runnable releases on destruction.
class ZimArticleRequest: public Dictionary::DataRequest
{
  friend class ZimArticleRequestRunnable;

  wstring word;
  vector< wstring > alts;
  ZimDictionary & dict;
  bool ignoreDiacritics;

  QAtomicInt isCancelled;
  QSemaphore hasExited;

public:

  ZimArticleRequest( wstring const & word_,
                     vector< wstring > const & alts_,
                     ZimDictionary & dict_,
                     bool ignoreDiacritics_ ):
    word( word_ ),
    alts( alts_ ),
    dict( dict_ ),
    ignoreDiacritics( ignoreDiacritics_ )
  {
    ZimArticleRequestRunnable * worker = new ZimArticleRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( worker );
  }

  /// Executed on a pool thread through the runnable.
  void run();

  /// Asks the running lookup to stop.
  virtual void cancel()
  {
    isCancelled.ref();
  }

  ~ZimArticleRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
void ZimArticleRequestRunnable::run()
{
r.run();
}
void ZimArticleRequest::run()
{
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
for( unsigned x = 0; x < alts.size(); ++x )
{
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
chain.insert( chain.end(), altChain.begin(), altChain.end() );
}
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded;
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
if( ignoreDiacritics )
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
for( unsigned x = 0; x < chain.size(); ++x )
{
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
string headword, articleText;
headword = chain[ x ].word;
quint32 articleNumber = 0xFFFFFFFF;
try
{
articleNumber = dict.loadArticle( chain[ x ].articleOffset, articleText, &articlesIncluded );
}
catch(...)
{
}
if( articleNumber == 0xFFFFFFFF )
continue;
if ( articlesIncluded.find( articleNumber ) != articlesIncluded.end() )
continue;
wstring headwordStripped =
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
if( ignoreDiacritics )
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
multimap< wstring, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ?
mainArticles : alternateArticles;
mapToUse.insert( pair< wstring, pair< string, string > >(
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
pair< string, string >( headword, articleText ) ) );
articlesIncluded.insert( articleNumber );
}
if ( mainArticles.empty() && alternateArticles.empty() )
{
finish();
return;
}
string result;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
multimap< wstring, pair< string, string > >::const_iterator i;
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{
result += "<div class=\"zimdict\">";
result += "<h2 class=\"zimdict_headword\">";
result += i->second.first;
result += "</h2>";
result += i->second.second;
result += cleaner + "</div>";
}
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
{
result += "<div class=\"zimdict\">";
result += "<h2 class=\"zimdict_headword\">";
result += i->second.first;
result += "</h2>";
result += i->second.second;
result += cleaner + "</div>";
}
Mutex::Lock _( dataMutex );
data.resize( result.size() );
memcpy( &data.front(), result.data(), result.size() );
hasAnyData = true;
finish();
}
/// Starts an asynchronous article lookup for @p word and its alternate forms.
///
/// The third (unnamed) parameter is not used by this implementation.
/// Constructing the ZimArticleRequest is what schedules the work, so the
/// returned request may already be running.
sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word,
vector< wstring > const & alts,
wstring const &,
bool ignoreDiacritics )
THROW_SPEC( std::exception )
{
return new ZimArticleRequest( word, alts, *this, ignoreDiacritics );
}
class ZimResourceRequest;

/// Thread-pool task that executes a ZimResourceRequest.
///
/// The semaphore passed to the constructor is released when this object is
/// destroyed, which lets the owning request wait until the task is gone.
class ZimResourceRequestRunnable: public QRunnable
{
  ZimResourceRequest & r;  ///< Request whose run() is invoked by this task
  QSemaphore & hasExited;  ///< Released from the destructor

public:

  ZimResourceRequestRunnable( ZimResourceRequest & r_,
                              QSemaphore & hasExited_ ):
    r( r_ ),
    hasExited( hasExited_ )
  {
  }

  ~ZimResourceRequestRunnable()
  {
    hasExited.release();
  }

  /// Forwards to ZimResourceRequest::run() (defined after that class).
  virtual void run();
};
/// Resource (image, stylesheet, ...) request for a ZIM dictionary.
///
/// Construction immediately queues a ZimResourceRequestRunnable on the global
/// thread pool; that task calls run(). The destructor flags the request as
/// cancelled and then blocks on the semaphore the task releases when it is
/// destroyed.
class ZimResourceRequest: public Dictionary::DataRequest
{
  friend class ZimResourceRequestRunnable;

  ZimDictionary & dict;
  string resourceName;     ///< Name handed to ZimDictionary::loadResource()

  QAtomicInt isCancelled;  ///< Non-zero once cancel() or the destructor ran
  QSemaphore hasExited;    ///< Released by the runnable's destructor

public:

  ZimResourceRequest( ZimDictionary & dict_,
                      string const & resourceName_ ):
    dict( dict_ ),
    resourceName( resourceName_ )
  {
    ZimResourceRequestRunnable * worker = new ZimResourceRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( worker );
  }

  /// Loads the resource; called from ZimResourceRequestRunnable::run().
  void run();

  /// Requests cancellation; run() checks the flag before it starts working.
  virtual void cancel()
  {
    isCancelled.ref();
  }

  ~ZimResourceRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
/// Task entry point: delegates all work to the ZimResourceRequest this
/// runnable was constructed with.
void ZimResourceRequestRunnable::run()
{
r.run();
}
void ZimResourceRequest::run()
{
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
try
{
string resource;
dict.loadResource( resourceName, resource );
if( resource.empty() )
throw File::Ex();
if( Filetype::isNameOfCSS( resourceName ) )
{
QString css = QString::fromUtf8( resource.data(), resource.size() );
dict.isolateCSS( css, ".zimdict" );
QByteArray bytes = css.toUtf8();
Mutex::Lock _( dataMutex );
data.resize( bytes.size() );
memcpy( &data.front(), bytes.constData(), bytes.size() );
}
else
if ( Filetype::isNameOfTiff( resourceName ) )
{
dataMutex.lock();
QImage img = QImage::fromData( reinterpret_cast< const uchar * >( resource.data() ), resource.size() );
#ifdef MAKE_EXTRA_TIFF_HANDLER
if( img.isNull() )
GdTiff::tiffToQImage( &data.front(), data.size(), img );
#endif
dataMutex.unlock();
if ( !img.isNull() )
{
QByteArray ba;
QBuffer buffer( &ba );
buffer.open( QIODevice::WriteOnly );
img.save( &buffer, "BMP" );
Mutex::Lock _( dataMutex );
data.resize( buffer.size() );
memcpy( &data.front(), buffer.data(), data.size() );
}
}
else
{
Mutex::Lock _( dataMutex );
data.resize( resource.size() );
memcpy( &data.front(), resource.data(), data.size() );
}
Mutex::Lock _( dataMutex );
hasAnyData = true;
}
catch( std::exception &ex )
{
gdCWarning( dictionaryResourceLc, "ZIM: Failed loading resource \"%s\" from \"%s\", reason: %s\n",
resourceName.c_str(), dict.getName().c_str(), ex.what() );
}
finish();
}
/// Starts an asynchronous request for the resource called @p name.
///
/// Constructing the ZimResourceRequest is what schedules the work, so the
/// returned request may already be running.
sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name )
THROW_SPEC( std::exception )
{
return new ZimResourceRequest( *this, name );
}
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
THROW_SPEC( std::exception )
{
vector< sptr< Dictionary::Class > > dictionaries;
for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
++i )
{
QString firstName = QDir::fromNativeSeparators( FsEncoding::decode( i->c_str() ) );
if( !firstName.endsWith( ".zim") && !firstName.endsWith( ".zimaa" ) )
continue;
ZimFile df( firstName );
vector< string > dictFiles;
df.getFilenames( dictFiles );
string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId;
try
{
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
indexIsOldOrBad( indexFile ) )
{
gdDebug( "Zim: Building the index for dictionary: %s\n", i->c_str() );
unsigned articleCount = 0;
unsigned wordCount = 0;
df.open();
ZIM_header const & zh = df.header();
if( zh.magicNumber != 0x44D495A )
throw exNotZimFile( i->c_str() );
if( zh.mimeListPos != sizeof( ZIM_header ) )
throw exInvalidZimHeader( i->c_str() );
bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 );
{
int n = firstName.lastIndexOf( '/' );
initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() );
}
File::Class idx( indexFile, "wb" );
IdxHeader idxHeader;
memset( &idxHeader, 0, sizeof( idxHeader ) );
idxHeader.namePtr = 0xFFFFFFFF;
idxHeader.descriptionPtr = 0xFFFFFFFF;
idxHeader.iconPtr = 0xFFFFFFFF;
idx.write( idxHeader );
IndexedWords indexedWords, indexedResources;
QByteArray artEntries;
df.seek( zh.urlPtrPos );
artEntries = df.read( (quint64)zh.articleCount * 8 );
QVector< quint64 > clusters;
clusters.reserve( zh.clusterCount );
df.seek( zh.clusterPtrPos );
{
QByteArray data = df.read( (quint64)zh.clusterCount * 8 );
for( unsigned n = 0; n < zh.clusterCount; n++ )
clusters.append( *( reinterpret_cast< const quint64 * >( data.constData() ) + n ) );
}
const quint64 * ptr;
quint16 mimetype, redirected_mime = 0xFFFF;
ArticleEntry artEntry;
RedirectEntry redEntry;
string url, title;
char nameSpace;
for( unsigned n = 0; n < zh.articleCount; n++ )
{
ptr = reinterpret_cast< const quint64 * >( artEntries.constData() ) + n;
df.seek( *ptr );
df.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) );
if( mimetype == 0xFFFF )
{
redEntry.mimetype = mimetype;
qint64 ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(RedirectEntry) - 2 );
if( ret != sizeof(RedirectEntry) - 2 )
throw exCantReadFile( i->c_str() );
redirected_mime = df.redirectedMimeType( redEntry );
nameSpace = redEntry.nameSpace;
}
else
{
artEntry.mimetype = mimetype;
qint64 ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(ArticleEntry) - 2 );
if( ret != sizeof(ArticleEntry) - 2 )
throw exCantReadFile( i->c_str() );
nameSpace = artEntry.nameSpace;
if( ( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces ) ) && df.isArticleMime( mimetype ) )
articleCount++;
}
char ch;
url.clear();
while( df.getChar( &ch ) )
{
if( ch == 0 )
break;
url.push_back( ch );
}
title.clear();
while( df.getChar( &ch ) )
{
if( ch == 0 )
break;
title.push_back( ch );
}
if( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces && ( df.isArticleMime( mimetype )
|| ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) ) ) )
{
wstring word;
if( !title.empty() )
word = Utf8::decode( title );
else
word = Utf8::decode( url );
if( df.isArticleMime( mimetype )
|| ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) )
{
if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, n );
else
indexedWords.addWord( word, n );
wordCount++;
}
else
{
url.insert( url.begin(), '/' );
url.insert( url.begin(), nameSpace );
indexedResources.addSingleWord( Utf8::decode( url ), n );
}
}
else
if( nameSpace == 'M' )
{
if( url.compare( "Title" ) == 0 )
{
idxHeader.namePtr = n;
string name;
readArticle( df, n, name );
initializing.indexingDictionary( name );
}
else
if( url.compare( "Description" ) == 0 )
idxHeader.descriptionPtr = n;
else
if( url.compare( "Language" ) == 0 )
{
string lang;
readArticle( df, n, lang );
if( lang.size() == 2 )
idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() );
else
if( lang.size() == 3 )
idxHeader.langFrom = LangCoder::findIdForLanguageCode3( lang.c_str() );
idxHeader.langTo = idxHeader.langFrom;
}
else
if( url.compare( "Illustration_48x48@1" ) == 0 )
idxHeader.iconPtr = n;
}
else
if( nameSpace == 'X' )
{
continue;
}
else
{
if( nameSpace == '-' && idxHeader.iconPtr == 0xFFFFFFFF && url.compare( "favicon" ) == 0 )
idxHeader.iconPtr = n;
url.insert( url.begin(), '/' );
url.insert( url.begin(), nameSpace );
indexedResources.addSingleWord( Utf8::decode( url ), n );
}
}
{
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
indexedWords.clear(); }
{
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedResources, idx );
idxHeader.resourceIndexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.resourceIndexRootOffset = idxInfo.rootOffset;
indexedResources.clear(); }
idxHeader.signature = Signature;
idxHeader.formatVersion = CurrentFormatVersion;
idxHeader.articleCount = articleCount;
idxHeader.wordCount = wordCount;
idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) );
}
dictionaries.push_back( new ZimDictionary( dictId,
indexFile,
dictFiles ) );
}
catch( std::exception & e )
{
gdWarning( "Zim dictionary initializing failed: %s, error: %s\n",
i->c_str(), e.what() );
continue;
}
catch( ... )
{
qWarning( "Zim dictionary initializing failed\n" );
continue;
}
}
return dictionaries;
}
}
#endif