warcat 0.3.4

Command-line tool and library for handling Web ARChive (WARC) files
Documentation
# Example of how to read WARC files.
import subprocess

import message


def main() -> None:
    """Read the example WARC file by streaming warcat's JSON Lines export.

    Launches ``warcat export`` as a child process, decodes its standard
    output with the ``message`` helper module, and prints a short summary of
    every decoded message: the header fields at the start of a record, the
    length of each block chunk, and a ``---`` separator at the end of a
    record.

    Raises:
        subprocess.CalledProcessError: If warcat exits with a non-zero
            status, so a failed export is not mistaken for an empty file.
    """
    # The options provided tell warcat to write JSON as a line to standard
    # out.
    # Ensure you have warcat on the search path or adjust the path as needed.
    command = [
        "warcat",
        "export",
        "--input=examples/example.warc",
        "--format=jsonl",
    ]

    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        # Decode each message by using our helper module.
        for msg in message.decode(process.stdout):
            if isinstance(msg, message.Header):
                # We decoded the start of the record.
                print(msg.fields)
            elif isinstance(msg, message.BlockChunk):
                # We decoded the body of the record.
                print(len(msg.data))
            elif isinstance(msg, message.BlockEnd):
                # The end of the record was reached.
                print("---")

    # Leaving the ``with`` block waits for the child process, so its exit
    # status is available here. Without this check a failing warcat would
    # look exactly like an input file that contains no records.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)


# Run the example only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()