edgesearch 0.4.1

"use strict";
const exists = (val) => val !== undefined;
// Easy reading and writing of memory sequentially without having to manage and update offsets/positions/pointers.
class MemoryWalker {
    constructor(buffer, next = 0) {
        this.buffer = buffer;
        this.next = next;
        this.dataView = new DataView(buffer);
        this.uint8Array = new Uint8Array(buffer);
    }
    jumpTo(ptr) {
        this.next = ptr;
        return this;
    }
    forkAndJump(ptr) {
        return new MemoryWalker(this.buffer, ptr);
    }
    skip(bytes) {
        this.next += bytes;
        return this;
    }
    readAndDereferencePointer() {
        return new MemoryWalker(this.buffer, this.readUInt32LE());
    }
    readSlice(len) {
        return this.buffer.slice(this.next, this.next += len);
    }
    readBoolean() {
        return !!this.dataView.getUint8(this.next++);
    }
    readUInt8() {
        return this.dataView.getUint8(this.next++);
    }
    readInt32LE() {
        const val = this.dataView.getInt32(this.next, true);
        this.next += 4;
        return val;
    }
    readInt32BE() {
        const val = this.dataView.getInt32(this.next, false);
        this.next += 4;
        return val;
    }
    readUInt32LE() {
        const val = this.dataView.getUint32(this.next, true);
        this.next += 4;
        return val;
    }
    readUInt32BE() {
        const val = this.dataView.getUint32(this.next, false);
        this.next += 4;
        return val;
    }
    readInt64LE() {
        const val = this.dataView.getBigInt64(this.next, true);
        this.next += 8;
        return val;
    }
    readUInt64LE() {
        const val = this.dataView.getBigUint64(this.next, true);
        this.next += 8;
        return val;
    }
    readDoubleLE() {
        const val = this.dataView.getFloat64(this.next, true);
        this.next += 8;
        return val;
    }
    readNullTerminatedString() {
        let end = this.next;
        while (this.uint8Array[end]) {
            end++;
        }
        const val = textDecoder.decode(this.uint8Array.slice(this.next, end));
        this.next = end + 1;
        return val;
    }
    writeUInt32LE(val) {
        this.dataView.setUint32(this.next, val, true);
        this.next += 4;
        return this;
    }
    writeAll(src) {
        this.uint8Array.set(src, this.next);
        this.next += src.byteLength;
        return this;
    }
}
const SPECIFIER_PARSERS = [
    { length: new Set(['hh', 'h', 'l', 'z', 't', '']), type: new Set('dic'), parse: mem => mem.readInt32LE() },
    { length: new Set(['hh', 'h', 'l', 'z', 't', '']), type: new Set('uxXop'), parse: mem => mem.readUInt32LE() },
    { length: new Set(['ll', 'j']), type: new Set('di'), parse: mem => mem.readInt64LE() },
    { length: new Set(['ll', 'j']), type: new Set('uxXop'), parse: mem => mem.readUInt64LE() },
    { length: new Set(['L', '']), type: new Set('fFeEgGaA'), parse: mem => mem.readDoubleLE() },
    { length: new Set(), type: new Set('s'), parse: mem => mem.readAndDereferencePointer().readNullTerminatedString() },
    { length: new Set(), type: new Set('%'), parse: () => '%' },
];
const SPECIFIER_FORMATTERS = {
    '%': () => '%',
    d: (val) => val.toString(),
    i: (val) => val.toString(),
    u: (val) => val.toString(),
    f: (val) => val.toLocaleString('fullwide', { useGrouping: false, maximumFractionDigits: 20 }),
    F: (val) => val.toLocaleString('fullwide', { useGrouping: false, maximumFractionDigits: 20 }).toUpperCase(),
    e: (val) => val.toExponential(2),
    E: (val) => val.toExponential(2).toUpperCase(),
    g: (val) => val.toString(),
    G: (val) => val.toString().toUpperCase(),
    x: (val) => val.toString(16),
    X: (val) => val.toString(16).toUpperCase(),
    o: (val) => val.toString(8),
    s: (val) => val,
    c: (val) => String.fromCharCode(val),
    p: (val) => val.toString(16),
    a: (val) => val.toString(16),
    A: (val) => val.toString(16).toUpperCase(),
};
const formatFromVarargs = (mem) => mem
    .readAndDereferencePointer()
    .readNullTerminatedString()
    .replace(/%([-+ 0'#]*)((?:[0-9]+|\*)?)((?:\.(?:[0-9]+|\*))?)((?:hh|h|l|ll|L|z|j|t|I|I32|I64|q)?)([%diufFeEgGxXoscpaA])/g, ((spec, flags, width, precision, length, type) => {
    // These aren't used in our C code right now but we can implement later on if we do.
    if (flags || width || precision) {
        throw new Error(`Unsupported format specifier "${spec}"`);
    }
    const parser = SPECIFIER_PARSERS.find(p => p.length.has(length) && p.type.has(type));
    if (!parser) {
        throw new SyntaxError(`Invalid format specifier "${spec}"`);
    }
    const rawValue = parser.parse(mem);
    return SPECIFIER_FORMATTERS[type](rawValue);
}));
const wasmMemory = new WebAssembly.Memory({ initial: 1024 });
const wasmInstance = new WebAssembly.Instance(QUERY_RUNNER_WASM, {
    env: {
        _wasm_import_log(argsPtr) {
            console.log(formatFromVarargs(queryRunnerMemory.forkAndJump(argsPtr)));
        },
        _wasm_import_error(argsPtr) {
            throw new Error(`[fprintf] ${formatFromVarargs(queryRunnerMemory.forkAndJump(argsPtr))}`);
        },
        memory: wasmMemory,
    },
});
const queryRunner = wasmInstance.exports;
const queryRunnerMemory = new MemoryWalker(wasmMemory.buffer);
const textEncoder = new TextEncoder();
const textDecoder = new TextDecoder();
const CORS_HEADERS = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, HEAD, POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type',
};
const responsePreflight = () => new Response(null, {
    headers: CORS_HEADERS,
});
const responseError = (error, status = 400) => new Response(JSON.stringify({ error }), {
    status, headers: {
        'Content-Type': 'application/json',
        ...CORS_HEADERS,
    },
});
const responseRawJson = (json, status = 200) => new Response(json, {
    status, headers: {
        'Content-Type': 'application/json',
        ...CORS_HEADERS,
    },
});
const responseNoResults = () => responseRawJson(`{"results":[],"continuation":null,"total":0}`);
const allocateKey = (key) => {
    if (typeof key == 'string') {
        const encoded = textEncoder.encode(key);
        const len = encoded.length;
        const ptr = queryRunner.malloc(len);
        queryRunnerMemory.forkAndJump(ptr).writeAll(encoded);
        return { ptr, len };
    }
    else {
        return key;
    }
};
const findContainingChunk = (key) => {
    let chunkRefPtr;
    let cKey = allocateKey(key);
    if (typeof cKey == 'number') {
        chunkRefPtr = queryRunner.find_chunk_containing_doc(cKey);
    }
    else {
        chunkRefPtr = queryRunner.find_chunk_containing_term(cKey.ptr, cKey.len);
    }
    console.log('Found containing chunk');
    if (chunkRefPtr === 0) {
        return undefined;
    }
    const chunkRef = queryRunnerMemory.forkAndJump(chunkRefPtr);
    const chunkId = chunkRef.readUInt32LE();
    const chunkMidPos = chunkRef.readUInt32LE();
    return { id: chunkId, midPos: chunkMidPos };
};
const compareKey = (a, b) => {
    return typeof a == 'number' ? a - b : a.localeCompare(b);
};
const extractKeyAtPosInBstChunkJs = (chunk, type) => {
    if (type == 'string') {
        // Keep in sync with build::chunks::ChunkStrKey.
        const len = chunk.readUInt8();
        return textDecoder.decode(chunk.readSlice(len));
    }
    else {
        // Keep in sync with build::ChunkU32Key.
        return chunk.readUInt32LE();
    }
};
const searchInBstChunkJs = (chunk, targetKey) => {
    while (true) {
        const currentKey = extractKeyAtPosInBstChunkJs(chunk, typeof targetKey);
        // Keep in sync with build::chunks::bst::BST::_serialise_node.
        const leftPos = chunk.readInt32LE();
        const rightPos = chunk.readInt32LE();
        const valueLen = chunk.readUInt32LE();
        const cmp = compareKey(targetKey, currentKey);
        if (cmp < 0) {
            if (leftPos == -1) {
                break;
            }
            chunk.jumpTo(leftPos);
        }
        else if (cmp == 0) {
            console.log('Found entry in chunk');
            return chunk.readSlice(valueLen);
        }
        else {
            if (rightPos == -1) {
                break;
            }
            chunk.jumpTo(rightPos);
        }
    }
    console.log('Searched failed to find entry in chunk');
    return undefined;
};
const findAllInChunks = async (chunkIdPrefix, keys) => {
    const results = [];
    // Group by chunk to avoid repeated fetches and memory management.
    const chunks = new Map();
    for (const key of keys) {
        const chunkRef = findContainingChunk(key);
        // We reserve a spot in `results` and keep track of it so that results are in the same order as `keys`,
        // and missing keys have `undefined` and can be detected.
        const resultIdx = results.push(undefined) - 1;
        if (!chunkRef) {
            continue;
        }
        if (!chunks.has(chunkRef.id)) {
            chunks.set(chunkRef.id, {
                keys: [],
                midPos: chunkRef.midPos,
            });
        }
        chunks.get(chunkRef.id).keys.push([key, resultIdx]);
    }
    // We want to process chunks one by one as otherwise we will run into memory limits
    // from fetching and allocating memory for too many at once.
    for (const [chunkId, { keys, midPos }] of chunks.entries()) {
        const chunkData = await fetchChunk(chunkIdPrefix, chunkId);
        // We need to reset as otherwise we might overflow memory with unused previous chunks.
        // queryRunner.reset();
        // const res = searchInBstChunk(chunkData, chunkRef.midPos, key);
        for (const [key, resultIdx] of keys) {
            const entry = searchInBstChunkJs(new MemoryWalker(chunkData).jumpTo(midPos), key);
            if (!entry) {
                continue;
            }
            results[resultIdx] = entry;
        }
    }
    return results;
};
// Take a raw query string and parse in into an array with three subarrays, each subarray representing terms for a mode.
const parseQuery = (termsRaw) => {
    const modeTerms = [
        Array(),
        Array(),
        Array(),
    ];
    for (const value of termsRaw) {
        // Synchronise mode IDs with mode_t enum in wasm/index.c.
        const matches = /^([012])_([^&]+)(?:&|$)/.exec(value);
        if (!matches) {
            return;
        }
        const mode = Number.parseInt(matches[1], 10);
        const term = decodeURIComponent(matches[2]);
        modeTerms[mode].push(term);
    }
    return modeTerms;
};
const readResult = (result) => {
    // Synchronise with `results_t` in wasm/index.c.
    const continuation = result.readInt32LE();
    const total = result.readUInt32LE();
    const count = result.readUInt8();
    // Starts from next WORD_SIZE (uint32_t) due to alignment.
    result.skip(3);
    const documents = [];
    for (let resultNo = 0; resultNo < count; resultNo++) {
        // Synchronise with `doc_id_t` in wasm/index.c.
        const docId = result.readUInt32LE();
        documents.push(docId);
    }
    return { continuation: continuation == -1 ? null : continuation, total, documents };
};
const findSerialisedTermBitmaps = (query) => 
// Keep in sync with deploy/mod.rs.
Promise.all(query.map(modeTerms => findAllInChunks('terms/', modeTerms)));
const buildIndexQuery = async (firstRank, modeTermBitmaps) => {
    const bitmapCount = modeTermBitmaps.reduce((count, modeTerms) => count + modeTerms.length, 0);
    // Synchronise with index_query_t.
    const input = new MemoryWalker(new ArrayBuffer(4 + (bitmapCount * 2 + 3) * 4));
    input.writeUInt32LE(firstRank);
    for (const mode of modeTermBitmaps) {
        for (const bitmap of mode) {
            const ptr = queryRunner.malloc(bitmap.byteLength);
            queryRunnerMemory.forkAndJump(ptr).writeAll(new Uint8Array(bitmap));
            // WASM is LE.
            input
                .writeUInt32LE(bitmap.byteLength)
                .writeUInt32LE(ptr);
        }
        input.writeUInt32LE(0);
    }
    return new Uint8Array(input.buffer);
};
const executePostingsListQuery = (queryData) => {
    const inputPtr = queryRunner.index_query_malloc();
    queryRunnerMemory.forkAndJump(inputPtr).writeAll(queryData);
    const outputPtr = queryRunner.index_query(inputPtr);
    return outputPtr == 0 ? undefined : readResult(queryRunnerMemory.forkAndJump(outputPtr));
};
const getAsciiBytes = (str) => new Uint8Array(str.split('').map(c => c.charCodeAt(0)));
const COMMA = getAsciiBytes(',');
const handleSearch = async (url) => {
    // NOTE: Just because there are no valid words does not mean that there are no valid results.
    // For example, excluding an invalid word actually results in all entries matching.
    const query = parseQuery(url.searchParams.getAll('t'));
    if (!query) {
        return responseError('Malformed query');
    }
    const continuation = Math.max(0, Number.parseInt(url.searchParams.get('c') || '', 10) || 0);
    const termCount = query.reduce((count, modeTerms) => count + modeTerms.length, 0);
    if (termCount > MAX_QUERY_TERMS) {
        return responseError('Too many terms', 413);
    }
    const modeTermBitmaps = await findSerialisedTermBitmaps(query);
    console.log('Bit sets retrieved');
    // Handling non-existent terms:
    // - If REQUIRE, then immediately return zero results, regardless of other terms of any mode.
    // - If CONTAIN, then simply omit.
    // - If EXCLUDE, then it depends; if there are other terms of any mode, then simply omit. If there are no other terms of any mode, then return default results.
    if (modeTermBitmaps[0].some(bm => !bm)) {
        return responseNoResults();
    }
    modeTermBitmaps[1] = modeTermBitmaps[1].filter(bm => bm);
    modeTermBitmaps[2] = modeTermBitmaps[2].filter(bm => bm);
    let result;
    if (modeTermBitmaps.every(modeTerms => !modeTerms.length)) {
        console.log('Using default results');
        const after = continuation + MAX_RESULTS;
        result = {
            continuation: DOCUMENT_COUNT > after ? after : null,
            documents: Array.from({ length: MAX_RESULTS }, (_, i) => continuation + i).filter(docId => docId >= 0 && docId < DOCUMENT_COUNT),
            total: DOCUMENT_COUNT,
        };
    }
    else {
        queryRunner.reset();
        const indexQueryData = await buildIndexQuery(continuation, modeTermBitmaps);
        console.log('Query built');
        const maybeResult = await executePostingsListQuery(indexQueryData);
        if (!maybeResult) {
            throw new Error(`Failed to execute query`);
        }
        result = maybeResult;
        console.log('Query executed');
    }
    // We want to avoid JSON.{parse,stringify} as they take up a lot of CPU time and often cause timeout exceptions in CF Workers for large payloads.
    // So, we manually build our response with buffers, as that's how documents are stored.
    // The buffers represent parts of the UTF-8 encoded JSON serialised response bytes.
    const jsonResPrefix = getAsciiBytes(`{"total":${result.total},"continuation":${result.continuation},"results":[`);
    const jsonResSuffix = getAsciiBytes(`]}`);
    // Each document should be a JSON serialised value encoded in UTF-8.
    const documents = (await findAllInChunks('documents/', result.documents))
        .filter(exists)
        .map(d => new Uint8Array(d));
    console.log('Documents fetched');
    const stream = new TransformStream();
    const writer = stream.writable.getWriter();
    writer.write(jsonResPrefix);
    for (let i = 0; i < documents.length; i++) {
        if (i !== 0) {
            writer.write(COMMA);
        }
        writer.write(documents[i]);
    }
    writer.write(jsonResSuffix);
    writer.releaseLock();
    return new Response(stream.readable, {
        status: 200,
        headers: {
            'Content-Type': 'application/json',
            ...CORS_HEADERS,
        },
    });
};
const requestHandler = async (request) => {
    if (request.method == 'OPTIONS') {
        return responsePreflight();
    }
    const url = new URL(request.url);
    return url.pathname === '/search'
        ? handleSearch(url)
        : new Response(null, { status: 404 });
};
// See https://github.com/Microsoft/TypeScript/issues/14877.
self.addEventListener('fetch', event => {
    event.respondWith(requestHandler(event.request));
});