libduckdb-sys 1.10503.0

#!/usr/bin/env python3

import gzip
import json
import os
import shutil
import stat
import sys
import tarfile
import tempfile
from pathlib import Path

# Directory containing this script; every other path below is derived from it.
SCRIPT_DIR = Path(__file__).resolve().parent

DUCKDB_SCRIPTS_DIR = SCRIPT_DIR.joinpath("duckdb-sources", "scripts")
TARGET_DIR = SCRIPT_DIR.joinpath("duckdb")
ARCHIVE_PATH = SCRIPT_DIR.joinpath("duckdb.tar.gz")
# DuckDB's package_build.py writes this temporary file inside duckdb-sources.
# build_package already copies it into TARGET_DIR, so it is deleted after
# every call.
PACKAGE_BUILD_LOADER_PATH = SCRIPT_DIR.joinpath(
    "duckdb-sources", "generated_extension_loader_package_build.cpp"
)
SRC_DIR = SCRIPT_DIR.joinpath("src")

# Extensions whose sources are collected. Their sources get compiled, but they
# only end up in the final build when explicitly enabled.
EXTENSIONS = ["core_functions", "parquet", "json"]

# Clear the duckdb directory so this run starts from an empty TARGET_DIR.
# A missing directory is fine (nothing to clear); any other rmtree error
# propagates and aborts the script.
try:
    shutil.rmtree(TARGET_DIR)
except FileNotFoundError:
    pass

# The directory was just removed, so this always creates it fresh.
TARGET_DIR.mkdir()

# package_build is imported from DUCKDB_SCRIPTS_DIR rather than from an
# installed package, so that directory must be on sys.path before the import.
sys.path.append(str(DUCKDB_SCRIPTS_DIR))
import package_build


def get_sources(extensions, default_linked_extensions=None):
    """Run ``package_build.build_package`` and return its file lists as sets.

    The package is built into ``TARGET_DIR`` for the given ``extensions``.
    ``default_linked_extensions`` is forwarded as a keyword argument only when
    it is not ``None``; otherwise ``build_package`` is called without it.

    Returns a ``(sources, include_dirs)`` tuple of sets. The third element of
    ``build_package``'s result is discarded.
    """
    optional_args = (
        {}
        if default_linked_extensions is None
        else {"default_linked_extensions": default_linked_extensions}
    )

    sources, includes, _ = package_build.build_package(
        str(TARGET_DIR), extensions, False, **optional_args
    )
    # The loader left behind in duckdb-sources is not needed once the call
    # has returned; it may legitimately be absent.
    PACKAGE_BUILD_LOADER_PATH.unlink(missing_ok=True)

    # Some source entries come back prefixed with the absolute script
    # directory; make those relative and leave the others untouched.
    absolute_prefix = f"{SCRIPT_DIR}{os.path.sep}"
    relative_sources = set()
    for entry in sources:
        if entry.startswith(absolute_prefix):
            entry = entry[len(absolute_prefix) :]
        relative_sources.add(entry)

    return relative_sources, set(includes)


# Sources and include directories of a build with no extensions at all.
base_source_list, base_include_list = get_sources([])

# For each extension, record only what it adds on top of the base build.
extension_sources = {}
for extension in EXTENSIONS:
    with_extension_sources, with_extension_includes = get_sources([extension])
    added_sources = with_extension_sources - base_source_list
    added_includes = with_extension_includes - base_include_list
    extension_sources[extension] = {
        "cpp_files": sorted(added_sources),
        "include_dirs": sorted(added_includes),
    }

# Every get_sources() call above removed package_build.py's temporary loader,
# and each per-extension call only knew about a single extension. Run once
# more with ALL extensions so the generated_extension_loader_package_build.cpp
# copied into TARGET_DIR registers every one of them, while defaulting none of
# them to linked. build_bundled_cc.rs later enables the subset selected by
# Cargo features through DUCKDB_EXTENSION_<NAME>_LINKED definitions.
get_sources(EXTENSIONS, default_linked_extensions=[])

# Manifest describing which files make up the base build and what each
# extension adds to it. Lists are sorted so the output is stable across runs.
manifest = {
    "base": {
        "cpp_files": sorted(base_source_list),
        "include_dirs": sorted(base_include_list),
    },
    "extensions": extension_sources,
}

# The manifest is written into TARGET_DIR and therefore ends up inside the
# archive. Pin the encoding and disable newline translation so its bytes do
# not depend on the platform: with the default text mode, "\n" would be
# written as os.linesep.
with (TARGET_DIR / "manifest.json").open(
    "w", encoding="utf-8", newline="\n"
) as f:
    json.dump(manifest, f, indent=2, sort_keys=True)
    f.write("\n")


def iter_archive_paths(root):
    """Yield ``root`` followed by every path below it, in a stable order.

    ``root`` is yielded exactly as passed in. For each directory visited by
    ``os.walk``, its sub-directories are yielded first and its files second,
    both in sorted order, as ``Path`` objects.
    """
    yield root
    for current, subdirs, files in os.walk(root):
        # Sorting in place matters for subdirs: os.walk descends into them in
        # the order this list ends up in.
        subdirs.sort()
        files.sort()
        parent = Path(current)
        yield from (parent / name for name in subdirs)
        yield from (parent / name for name in files)


def normalized_tarinfo(path):
    """Build a ``TarInfo`` for ``path`` with host-specific metadata fixed.

    The archive name is ``path`` relative to ``SCRIPT_DIR`` in POSIX form.
    Owner ids and names, the modification time and the pax headers are set
    to constant values, and the mode depends only on the entry type:
    ``0o777`` for symlinks, ``0o755`` for directories, ``0o644`` for regular
    files.

    Raises ``RuntimeError`` for anything that is not a symlink, directory or
    regular file.
    """
    info = tarfile.TarInfo(path.relative_to(SCRIPT_DIR).as_posix())
    info.uid = info.gid = 0
    info.uname = info.gname = ""
    info.mtime = 0
    info.pax_headers = {}

    # Symlinks are checked first so that a link is never classified by the
    # type of whatever it points to.
    if path.is_symlink():
        info.type, info.mode = tarfile.SYMTYPE, 0o777
        info.linkname = str(path.readlink())
        return info

    if path.is_dir():
        info.type, info.mode = tarfile.DIRTYPE, 0o755
        return info

    if path.is_file():
        info.type, info.mode = tarfile.REGTYPE, 0o644
        info.size = path.stat().st_size
        return info

    filemode = stat.filemode(path.lstat().st_mode)
    raise RuntimeError(f"unsupported archive entry type: {path} ({filemode})")


# The archive is produced with Python's tarfile rather than by shelling out to
# `tar -czf`. BSD tar (the macOS default) can suppress the gzip timestamp but
# lacks GNU tar's --mtime and --sort controls. Going through tarfile also
# keeps platform-specific metadata such as AppleDouble entries out of the
# archive.
def write_archive_to(archive_path):
    """Write TARGET_DIR as a gzip-compressed pax tar file at ``archive_path``.

    Entries are taken from ``iter_archive_paths(TARGET_DIR)`` in that order and
    described by ``normalized_tarinfo``. The gzip stream is opened with an
    empty filename and ``mtime=0``.
    """
    with archive_path.open("wb") as raw_file:
        gzip_stream = gzip.GzipFile(
            filename="", mode="wb", fileobj=raw_file, mtime=0
        )
        with gzip_stream, tarfile.open(
            fileobj=gzip_stream, mode="w", format=tarfile.PAX_FORMAT
        ) as tar:
            for entry in iter_archive_paths(TARGET_DIR):
                info = normalized_tarinfo(entry)
                if not info.isfile():
                    # Directories and symlinks carry no payload.
                    tar.addfile(info)
                    continue
                with entry.open("rb") as payload:
                    tar.addfile(info, payload)


def replace_archive_if_changed():
    """Rebuild the archive and swap it in only when its bytes differ.

    The new archive is first written to a temporary file next to
    ``ARCHIVE_PATH`` (inside ``SCRIPT_DIR``). If ``ARCHIVE_PATH`` already
    exists with identical contents it is left untouched; otherwise the
    temporary file is moved over it. The temporary file never outlives this
    call, whether it returns normally, raises, or is interrupted.

    Any exception from writing, comparing or replacing the archive propagates
    to the caller.
    """
    temp_fd, temp_archive_name = tempfile.mkstemp(
        dir=SCRIPT_DIR, prefix=f".{ARCHIVE_PATH.name}."
    )
    # Only the reserved name is needed; write_archive_to reopens it by path.
    os.close(temp_fd)
    temp_archive_path = Path(temp_archive_name)
    # NOTE(review): mkstemp creates the file readable and writable only by the
    # creating user, and replace() carries that mode over to ARCHIVE_PATH —
    # confirm that is acceptable.
    try:
        write_archive_to(temp_archive_path)
        # Cargo tracks rerun-if-changed inputs by filesystem timestamp. Keep the
        # existing archive in place when bytes match so a no-op source update
        # does not trigger a bundled rebuild.
        if (
            ARCHIVE_PATH.exists()
            and temp_archive_path.read_bytes() == ARCHIVE_PATH.read_bytes()
        ):
            return
        temp_archive_path.replace(ARCHIVE_PATH)
    finally:
        # Clean up in `finally` rather than `except Exception` so that
        # KeyboardInterrupt / SystemExit during a long archive write do not
        # leave a hidden temporary file behind in SCRIPT_DIR. After a
        # successful replace() the path is already gone and this is a no-op.
        temp_archive_path.unlink(missing_ok=True)


# Regenerate the archive now that TARGET_DIR and its manifest are in place.
replace_archive_if_changed()