ragit 0.4.5

git-like rag pipeline
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
from abbrev import abbrev
from add_and_rm import add_and_rm
from add_and_rm2 import add_and_rm2
from archive import archive
from audit import audit
from erroneous_llm import erroneous_llm
from cannot_read_images import cannot_read_images
from cargo_tests import cargo_tests
from cargo_features import cargo_features
from cat_file import cat_file
from clean_up_erroneous_chunk import clean_up_erroneous_chunk
from cli import cli
from clone import clone
from clone_empty import clone_empty
from config import config
from csv_reader import csv_reader
from empty import empty
from end_to_end import end_to_end
from external_bases import external_bases
from extract_keywords import extract_keywords
from fetch_models import fetch_models
from generous_file_reader import generous_file_reader
from gh_issue_20 import gh_issue_20
from ignore import ignore
from ii import ii
from images import images
from images2 import images2
from images3 import images3
from korean import korean
from logs import logs
from ls import ls
from ls_dedup import ls_dedup
from ls_queries import ls_queries
from many_chunks import many_chunks
from many_jobs import many_jobs
from markdown_reader import markdown_reader
from merge import merge
from meta import meta
from migrate import migrate
from migrate2 import migrate2
from migrate3 import migrate3
from models_init import models_init, test_home_config_override
from orphan_process import orphan_process
from outside import outside
from pdf import pdf
from pdl import pdl
from pdl_escape import pdl_escape
from prompts import prompts
from pull import pull
from pull_ragithub import pull_ragithub
from query_options import query_options
from query_with_schema import query_with_schema
from ragit_api import ragit_api
from real_repos import real_repos
from real_repos_regression import real_repos_regression
from recover import recover
from retrieve_chunks import retrieve_chunks
from server import server
from server_ai_model import server_ai_model
from server_chat import server_chat
from server_file_tree import server_file_tree
from server_permission import server_permission
from subdir import subdir
from summary import summary
from svg import svg
from symlink import symlink
from tfidf import tfidf
from web_images import web_images
from write_lock import write_lock

from datetime import datetime
import os
from random import seed as rand_seed
import sys
from utils import (
    clean,
    clean_test_output,
    get_commit_hash,
    get_coverage,
    get_ragit_version,
    goto_root,
    recv_message,
    reset_message,
)

def get_platform_info() -> dict[str, str]:
    """Collect toolchain and interpreter details of the machine running the tests.

    Every probe is best-effort: if one raises an ordinary exception, its entry
    holds a "cannot get ..." message with the exception text instead of the
    real value, and the remaining probes still run.

    Returns:
        A dict with the keys `cargo_version`, `rustc_version`,
        `python_version` and `platform`, inserted in that order.
    """
    def _stdout_of(argv: list[str]) -> str:
        # `check=True` makes a non-zero exit status raise, so the caller
        # reports it the same way as a missing executable.
        import subprocess
        finished = subprocess.run(argv, capture_output=True, text=True, check=True)
        return finished.stdout.strip()

    info = {}

    # Rust toolchain: both values come from the commands' stdout.
    try:
        info["cargo_version"] = _stdout_of(["cargo", "version"])

    except Exception as err:
        info["cargo_version"] = f"cannot get cargo_version: {err}"

    try:
        info["rustc_version"] = _stdout_of(["rustc", "--version"])

    except Exception as err:
        info["rustc_version"] = f"cannot get rustc_version: {err}"

    # Python interpreter and operating system, via the `platform` module.
    try:
        import platform
        info["python_version"] = platform.python_version()

    except Exception as err:
        info["python_version"] = f"cannot get python_version: {err}"

    try:
        import platform
        info["platform"] = platform.platform()

    except Exception as err:
        info["platform"] = f"cannot get platform: {err}"

    return info

# Usage text for this test runner: one entry per command name, followed by a
# short description of what that test does.
# Bracketed words after a command name (e.g. `[model]`, `[model=dummy]`,
# `[repo=all]`) describe the extra positional arguments of that command; the
# `name=value` form shows the value used when the argument is omitted.
# NOTE(review): the code that prints this text is not visible in this section —
# keep the entries in sync with the command dispatch in the `__main__` block.
help_message = """
Commands
    end_to_end [model=dummy]    run `end_to_end` test
                                It simulates a basic workflow of ragit: init, add, build and
                                query. It runs on a real dataset: the documents of ragit.

    external_bases              run `external_bases` test
                                It creates bunch of knowledge-bases and run
                                `rag merge` on them. It also checks whether `rag tfidf`
                                can successfully retrieve a chunk from multiple
                                knowledge-bases.

    merge                       run `merge` test
                                It's like `external_bases` test, but with `--prefix` option.

    abbrev                      run `abbrev` test
                                It tests `--abbrev` option.

    add_and_rm                  run `add_and_rm` test
                                It runs tons of `rag add` and `rag rm` with different options.

    add_and_rm2                 run `add_and_rm2` test
                                Like `add_and_rm`, but it's more focused on `rag rm`.

    ignore                      run `ignore` test
                                It tests whether `rag add` respects `.ragignore` or
                                `.gitignore`.

    archive                     run `archive` test
                                It runs `archive-create` and `archive-extract` and check
                                if the extracted knowledge-base is identical to the original
                                one.

    recover                     run `recover` test
                                It checks whether 1) `rag check` fails on a broken
                                knowledge-base and 2) `rag check --recover` can fix a broken
                                knowledge-base.

    cannot_read_images          run `cannot_read_images` test
                                Some models can read images while some cannot. It tests what
                                happens when a model that cannot read images tries to manage
                                a knowledge-base with an image.

    clone                       run `clone` test
                                It creates a knowledge-base, pushes, clones and checks it.
                                It runs a local `ragit-server` in this repository.

    clone_empty                 run `clone_empty` test
                                It creates an empty repository in ragit-server, clones the
                                repository (which is not an error), adds some chunks to it,
                                and pushes it back to the server.

    pull                        run `pull` test
                                It creates a repository, pushes and pulls the repository and
                                see if it works.

    pull_ragithub               run `pull_ragithub` test
                                Most other tests, including `pull` runs ragit-server on
                                localhost, but it clones and pulls a knowledge-base from
                                ragithub. I want to make sure that the latest version is
                                always compatible with ragithub.

    server                      run `server` test
                                It tests endpoints related to a repository. It first pushes a
                                repository and fetches data (chunks, images, files, ...) from
                                the server.

    server_ai_model             run `server_ai_model` test
                                It tests endpoints related to ai models. It creates/updates and
                                reads ai models. Deletion is WIP.

    server_chat [model]         run `server_chat` test
                                It tests chat-related endpoints of ragit-server.

    server_file_tree            run `server_file_tree` test
                                It tests `/{user}/{repo}/file-content` api of ragit-server.

    server_permission           run `server_permission` test
                                It creates users and repositories with different permissions
                                and sends requests with/without api keys.

    fetch_models                run `fetch_models` test
                                It's like `server_ai_model`, but it updates local `models.json`
                                instead of server.

    query_options [model]       run `query_options` test
                                It tests various option flags of `rag query`.

    query_with_schema [model]   run `query_with_schema` test
                                It tests `--schema` flag of `rag query`.

    cli                         run `cli` test
                                It tests whether cli parser can parse the arguments correctly.
                                It also creates invalid inputs and see if the cli parser can
                                successfully reject them.

    outside                     run `outside`
                                It tests whether ragit can successfully reject files outside
                                a knowledge-base.

    migrate                     run `migrate` test
                                It checks out git to v 0.2.0, creates a knowledge-base, and
                                run `migrate` until the knowledge-base is migrated to the
                                newest version.
                                Since it runs `git checkout`, it may mess up your working
                                tree. If you have uncommitted changes, this test will do
                                nothing and fail.

    migrate2                    run `migrate2` test
                                Like `migrate`, but clones knowledge-bases from web instead
                                of creating a mock knowledge-base.

    migrate3                    run `migrate3` test
                                It creates knowledge-bases with different versions of ragit.
                                Then it makes sure that the versions can clone/push to the
                                latest version of ragit-server.

    config                      run `config` test
                                I have added new configs to ragit 0.3.5. And I want to see if
                                it's compatible with older versions.

    gh_issue_20                 run `gh_issue_20` test
                                https://github.com/baehyunsol/ragit/issues/20
                                It tests `-C` option.

    many_chunks                 run `many_chunks` test
                                It creates a lot of small files and see if ragit can
                                handle the files correctly. It also tests interrupting
                                `rag build`.

    erroneous_llm               run `erroneous_llm` test
                                It tests how `rag build` behaves when the LLM server is
                                unstable.

    many_jobs [model=dummy] [jobs=999]
                                run `many_jobs` test
                                `rag build` by default runs with many processes, and a
                                multi-process program may introduce many unexpected bugs.
                                It runs `rag build` with many processes and see if it works.
                                You'd better run it on a machine with many cores.

    ls                          run `ls` test
                                It runs `ls-chunks`, `ls-files`, `ls-images`, `ls-terms` and
                                `tfidf` with bunch of different options.

    ls_dedup                    run `ls_dedup` test
                                Some commands match chunk uid and file uid at the same time.
                                There's a possibility that a chunk and a file have the same
                                uid prefix. It tests whether ragit can deduplicate the same
                                uids in such cases.

    ls_queries                  run `ls_queries` test
                                It runs `ls-queries` multiple times with different options,
                                and check if it behaves as expected. It also tests other
                                commands that deal with query histories. I implemented an
                                extra test case instead of updating the `ls` test, because
                                the test is getting too long.

    logs [model]                run `logs` test
                                It checks if `rag config --set dump_log true` and
                                `rag gc --logs` work correctly.

    meta                        run `meta` test
                                It runs `rag meta`-family commands and see if it works.

    empty [model=dummy]         run `empty` test
                                It sees if ragit can handle an empty file correctly.

    symlink                     run `symlink` test
                                It tests whether ragit can handle symlinks correctly
                                without falling into infinite loops.

    ii                          run `ii` test
                                It creates an inverted index and test it.

    cat_file                    run `cat_file` test

    generous_file_reader        run `generous_file_reader` test
                                If some files are broken, ragit is supposed to
                                skip the broken files and continue processing the
                                valid files.

    clean_up_erroneous_chunk    run `clean_up_erroneous_chunk` test
                                It's an edge case in `generous_file_reader`.

    audit [model]               run `audit` test

    summary                     run `summary` test
                                It tests `rag summary`. It doesn't evaluate the quality of the
                                generated summary. It only checks whether the cli option of the
                                command work as expected. `end_to_end` test runs the command
                                and dumps the output, so you can check that out if you want to
                                see the quality of a summary.

    images                      run `images` test
                                It creates a markdown file with images and check
                                whether the markdown reader can parse the file
                                correctly.

    images2 [model]             run `images2` test
                                It tests whether models can generate image-description
                                files correctly.
                                NOTE: It uses the vision capability of the model.
                                      Make sure that the model has one.

    images3 [model]             run `images3` test
                                Other tests test images in markdown files, but they
                                don't test image file readers. It does.

    pdf [model]                 run `pdf` test
                                It tests the pdf reader.
                                You have to use a vision language model!

    pdl [model]                 run `pdl` test
                                It tests `rag pdl` command.

    pdl_escape                  run `pdl_escape` test
                                The pdl engine uses tera under the hood. The pdl engine
                                modifies tera's default escape function. It tests whether
                                the escape function works.

    svg [model]                 run `svg` test
                                It tests the svg reader.
                                You have to use a vision language model!

    web_images [model]          run `web_images` test
                                It tests whether ragit can fetch images from web.

    extract_keywords [model]    run `extract_keywords` test
                                It tests whether `rag extract-keywords` command works.

    orphan_process              run `orphan_process` test
                                It reproduces gh issue #9.
                                https://github.com/baehyunsol/ragit/issues/9

    write_lock                  run `write_lock` test
                                It reproduces gh issue #8.
                                https://github.com/baehyunsol/ragit/issues/8

    markdown_reader             run `markdown_reader` test
                                I have found many bugs in `markdown_reader_v0`. The bugs are
                                reproduced in this test. If you find a new one, please add that
                                to this test.

    csv_reader                  run `csv_reader` test

    real_repos [repo=all]       run `real_repos` test
                                It clones real git repos from the web and build knowledge-base
                                of the repos.
                                This is a very important test because it's the exact use
                                case of ragit that I have in my mind.
                                The test code uses the dummy model and only test file readers.
                                If you want to use real models, you have to run the main
                                function of `real_repos.py`.

    real_repos_regression       run `real_repos_regression` test
                                I ran `python3 tests.py real_repos` and was surprised to see
                                it throwing so many errors. Many of them were ragit's fault. So
                                I created this test, which tries to reproduce all the errors
                                found in the `real_repos` test.

    prompts [model=dummy]       run `prompts` test
                                It's the smallest set of commands that parses and executes all
                                the `.pdl` files in `prompts/` directory.

    retrieve_chunks [model]     run `retrieve_chunks` test
                                It tests the `rag retrieve-chunks` command.

    subdir                      run `subdir` test
                                It checks whether `ragit` is smart enough to find `.ragit/` in
                                any directory.

    tfidf                       run `tfidf` test
                                It creates bunch of lorem-ipsum files and see if `rag tfidf`
                                can retrieve files correctly. It also tests tfidf searches on
                                cjk strings.

    korean                      run `korean` test
                                It runs ragit with/without "korean" feature and makes sure that
                                the tokenizer behaves differently.

    ragit_api [model]           run `ragit_api` test
                                It asks "what's your name" to the model. It returns OK if the
                                api call was successful. It doesn't care about the content of
                                the model's response.

    cargo_tests                 run `cargo test` on all the crates
                                It also makes sure that cargo emits no warnings.

    cargo_features              run `cargo_features` test
                                Ragit has many cargo features. This test compiles ragit with
                                all the possible combinations of features and makes sure that
                                they all compile.

    models_init                 run `models_init` test
                                It tests the initialization of models.json and
                                model selection in api.json.

    all                         run all tests
                                It dumps the test result to `tests/results.json`.
"""

if __name__ == "__main__":
    no_clean = "--no-clean" in sys.argv
    args = [arg for arg in sys.argv if arg != "--no-clean"]
    seed = [arg for arg in args if arg.startswith("--seed=")]

    if len(seed) > 0:
        args = [arg for arg in args if arg not in seed]
        seed = int(seed[0].split("=")[1])

    else:
        now = datetime.now()
        seed = int(f"{now.year:04}{now.month:02}{now.day:02}{now.hour:02}{now.minute:02}{now.second:02}")

    command = args[1] if len(args) > 1 else None
    test_model = args[2] if len(args) > 2 else None
    rand_seed(seed)

    try:
        if command == "end_to_end":
            test_model = test_model or "dummy"
            end_to_end(test_model=test_model)

        elif command == "external_bases":
            external_bases()

        elif command == "merge":
            merge()

        elif command == "abbrev":
            abbrev()

        elif command == "add_and_rm":
            add_and_rm()

        elif command == "add_and_rm2":
            add_and_rm2()

        elif command == "ignore":
            ignore()

        elif command == "recover":
            recover()

        elif command == "cannot_read_images":
            cannot_read_images()

        elif command == "clone":
            clone()

        elif command == "clone_empty":
            clone_empty()

        elif command == "pull":
            pull()

        elif command == "pull_ragithub":
            pull_ragithub()

        elif command == "server":
            server()

        elif command == "server_ai_model":
            server_ai_model()

        elif command == "server_chat":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            server_chat(test_model=test_model)

        elif command == "server_file_tree":
            server_file_tree()

        elif command == "server_permission":
            server_permission()

        elif command == "fetch_models":
            fetch_models()

        elif command == "query_options":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            query_options(test_model=test_model)

        elif command == "query_with_schema":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            query_with_schema(test_model=test_model)

        elif command == "cli":
            cli()

        elif command == "outside":
            outside()

        elif command == "audit":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            audit(test_model=test_model)

        elif command == "summary":
            summary()

        elif command == "migrate":
            migrate()

        elif command == "migrate2":
            migrate2()

        elif command == "migrate3":
            migrate3()

        elif command == "config":
            config()

        elif command == "gh_issue_20":
            gh_issue_20()

        elif command == "archive":
            archive()

        elif command == "many_chunks":
            many_chunks()

        elif command == "erroneous_llm":
            erroneous_llm()

        elif command == "many_jobs":
            jobs = args[3] if len(args) > 3 else 999
            test_model = test_model if test_model else "dummy"
            many_jobs(test_model=test_model, jobs=jobs)

        elif command == "ls":
            ls()

        elif command == "ls_dedup":
            ls_dedup()

        elif command == "ls_queries":
            ls_queries()

        elif command == "logs":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            logs(test_model=test_model)

        elif command == "meta":
            meta()

        elif command == "symlink":
            symlink()

        elif command == "empty":
            test_model = test_model or "dummy"
            empty(test_model)

        elif command == "ii":
            ii()

        elif command == "cat_file":
            cat_file()

        elif command == "generous_file_reader":
            generous_file_reader()

        elif command == "clean_up_erroneous_chunk":
            clean_up_erroneous_chunk()

        elif command == "images":
            images()

        elif command == "images2":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            images2(test_model=test_model)

        elif command == "images3":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            images3(test_model=test_model)

        elif command == "pdf":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            pdf(test_model=test_model)

        elif command == "pdl":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            pdl(test_model=test_model)

        elif command == "pdl_escape":
            pdl_escape()

        elif command == "svg":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            svg(test_model=test_model)

        elif command == "web_images":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            web_images(test_model=test_model)

        elif command == "extract_keywords":
            if test_model is None:
                print("Please specify which model to run the tests with.")
                sys.exit(1)

            extract_keywords(test_model=test_model)

        elif command == "orphan_process":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            orphan_process(test_model=test_model)

        elif command == "write_lock":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            write_lock(test_model=test_model)

        elif command == "markdown_reader":
            markdown_reader()

        elif command == "csv_reader":
            csv_reader()

        elif command == "real_repos":
            repo = "all" if len(args) < 3 else args[2]
            real_repos(repo=repo)

        elif command == "real_repos_regression":
            real_repos_regression()

        elif command == "prompts":
            test_model = test_model or "dummy"
            prompts(test_model=test_model)

        elif command == "retrieve_chunks":
            if test_model is None or test_model == "dummy":
                print("Please specify which model to run the tests with. You cannot run this test with a dummy model.")
                sys.exit(1)

            retrieve_chunks(test_model=test_model)

        elif command == "subdir":
            subdir()

        elif command == "tfidf":
            tfidf()

        elif command == "korean":
            korean()

        elif command == "ragit_api":
            if test_model is None:
                print("Please specify which model to run the tests with.")
                sys.exit(1)

            ragit_api(test_model=test_model)

        elif command == "cargo_tests":
            cargo_tests()

        elif command == "cargo_features":
            cargo_features()
            
        elif command == "models_init":
            models_init()
            test_home_config_override()

        elif command == "all":
            import json
            import time
            import traceback

            tests = [
                ("cargo_tests", cargo_tests),
                ("cargo_features", cargo_features),
                ("abbrev", abbrev),
                ("add_and_rm", add_and_rm),
                ("add_and_rm2", add_and_rm2),
                ("ignore", ignore),
                ("recover", recover),
                ("cannot_read_images", cannot_read_images),
                ("clone", clone),
                ("clone_empty", clone_empty),
                ("pull", pull),
                ("pull_ragithub", pull_ragithub),
                ("server", server),
                ("server_ai_model", server_ai_model),
                ("server_permission", server_permission),
                ("server_file_tree", server_file_tree),
                ("fetch_models", fetch_models),
                ("cli", cli),
                ("outside", outside),
                ("archive", archive),
                ("many_chunks", many_chunks),
                ("erroneous_llm", erroneous_llm),
                ("pdl_escape", pdl_escape),
                ("many_jobs", many_jobs),
                ("ls", ls),
                ("ls_dedup", ls_dedup),
                ("ls_queries", ls_queries),
                ("meta", meta),
                ("symlink", symlink),
                ("gh_issue_20", gh_issue_20),
                ("ii", ii),
                ("cat_file", cat_file),
                ("generous_file_reader", generous_file_reader),
                ("clean_up_erroneous_chunk", clean_up_erroneous_chunk),
                ("images", images),
                ("markdown_reader", markdown_reader),
                ("csv_reader", csv_reader),
                ("real_repos", real_repos),
                ("real_repos_regression", real_repos_regression),
                ("subdir", subdir),
                ("tfidf", tfidf),
                ("korean", korean),
                ("merge", merge),
                ("summary", summary),
                ("external_bases", external_bases),
                ("end_to_end dummy", lambda: end_to_end(test_model="dummy")),
                ("end_to_end gpt-oss-20b-groq", lambda: end_to_end(test_model="gpt-oss-20b-groq")),
                ("end_to_end gpt-5-mini", lambda: end_to_end(test_model="gpt-5-mini")),
                ("audit gpt-oss-20b-groq", lambda: audit(test_model="gpt-oss-20b-groq")),
                ("logs gpt-oss-20b-groq", lambda: logs(test_model="gpt-oss-20b-groq")),
                ("prompts dummy", lambda: prompts(test_model="dummy")),
                ("prompts gpt-5-mini", lambda: prompts(test_model="gpt-5-mini")),
                ("prompts gpt-oss-20b-groq", lambda: prompts(test_model="gpt-oss-20b-groq")),
                ("prompts gemini-2.0-flash", lambda: prompts(test_model="gemini-2.0-flash")),
                ("prompts claude-4.5-haiku", lambda: prompts(test_model="claude-4.5-haiku")),
                ("retrieve_chunks claude-4.5-haiku", lambda: retrieve_chunks(test_model="claude-4.5-haiku")),
                ("retrieve_chunks gpt-oss-20b-groq", lambda: retrieve_chunks(test_model="gpt-oss-20b-groq")),
                ("empty dummy", lambda: empty(test_model="dummy")),
                ("empty gpt-oss-20b-groq", lambda: empty(test_model="gpt-oss-20b-groq")),
                ("server_chat gpt-oss-20b-groq", lambda: server_chat(test_model="gpt-oss-20b-groq")),
                ("server_chat gemini-2.0-flash", lambda: server_chat(test_model="gemini-2.0-flash")),
                ("images2 gpt-5-mini", lambda: images2(test_model="gpt-5-mini")),
                ("images3 gpt-5-mini", lambda: images3(test_model="gpt-5-mini")),
                ("pdl gpt-5-mini", lambda: pdl(test_model="gpt-5-mini")),
                ("pdf gpt-5-mini", lambda: pdf(test_model="gpt-5-mini")),
                ("svg gpt-5-mini", lambda: svg(test_model="gpt-5-mini")),
                ("web_images gpt-5-mini", lambda: web_images(test_model="gpt-5-mini")),
                ("images2 claude-4.5-haiku", lambda: images2(test_model="claude-4.5-haiku")),

                ("extract_keywords dummy", lambda: extract_keywords(test_model="dummy")),
                ("extract_keywords gpt-5-mini", lambda: extract_keywords(test_model="gpt-5-mini")),
                ("orphan_process gpt-oss-20b-groq", lambda: orphan_process(test_model="gpt-oss-20b-groq")),
                ("write_lock gpt-oss-20b-groq", lambda: write_lock(test_model="gpt-oss-20b-groq")),
                ("ragit_api command-r", lambda: ragit_api(test_model="command-r")),
                ("query_options gpt-oss-20b-groq", lambda: query_options(test_model="gpt-oss-20b-groq")),
                ("query_with_schema gpt-oss-20b-groq", lambda: query_with_schema(test_model="gpt-oss-20b-groq")),
                ("models_init", models_init),
                ("test_home_config_override", test_home_config_override),
                ("config", config),  # NOTE: it checkouts older versions of ragit
                ("migrate", migrate),
                ("migrate2", migrate2),
                ("migrate3", migrate3),
            ]
            started_at = datetime.now()
            has_error = False
            result = {
                "meta": {
                    "complete": False,
                    "started_at": str(started_at),
                    "commit": get_commit_hash(),
                    "platform": get_platform_info(),
                    "ragit_version": get_ragit_version(),
                    "rand_seed": seed,
                },
                "tests": {},
                "result": {
                    "total": len(tests),
                    "complete": 0,
                    "pass": 0,
                    "fail": 0,
                    "remaining": len(tests),
                },
            }

            with open("result.json", "w") as f:
                f.write(json.dumps(result, indent=4, ensure_ascii=True))

            # Run every registered test in order. After each one, record its outcome and
            # rewrite `result.json`, so a partial report survives if the run is interrupted.
            for seq, (name, test) in enumerate(tests):
                print(f"running `{name}`...", flush=True)
                start = time.time()

                try:
                    reset_message()
                    rand_seed(seed)  # every test starts from the same seed
                    test()

                except Exception as e:
                    has_error = True
                    result["tests"][name] = {
                        "seq": seq,
                        "pass": False,
                        "error": clean_test_output(str(e) + "\n" + traceback.format_exc()),
                        "elapsed_ms": int((time.time() - start) * 1000),
                    }
                    result["result"]["fail"] += 1

                else:
                    result["tests"][name] = {
                        "seq": seq,
                        "pass": True,
                        "error": None,
                        "elapsed_ms": int((time.time() - start) * 1000),
                    }
                    result["result"]["pass"] += 1

                finally:
                    result["result"]["complete"] += 1
                    result["result"]["remaining"] -= 1
                    result["coverage"] = get_coverage()

                    # `except Exception` does not catch every exception (e.g. `SystemExit`,
                    # `KeyboardInterrupt`), so there may be no entry for this test yet.
                    if name in result["tests"]:
                        result["tests"][name]["message"] = recv_message()

                    if not no_clean:
                        try:
                            clean()

                        # `clean()` may fail. For example, a test may spawn a process and die
                        # while its children are still alive. The children keep writing to the
                        # tmp dir, which can break `shutil.rmtree()`.
                        except Exception as cleanup_exc:
                            # The entry is missing when the test died with a non-`Exception`.
                            # Create a minimal one instead of raising `KeyError` here, which
                            # would mask the original exception and skip the report write below.
                            entry = result["tests"].setdefault(name, {"seq": seq, "pass": False})
                            entry["cleanup_error"] = str(cleanup_exc) + "\n" + traceback.format_exc()

                    # A test may have changed the working directory; go back to `tests/`
                    # before writing the report.
                    goto_root()
                    os.chdir("tests")

                    with open("result.json", "w") as f:
                        f.write(json.dumps(result, indent=4, ensure_ascii=True))

            # Finalize the report: stamp the end time and total duration, mark the run as
            # complete, then print the JSON and write it to `tests/result.json`.
            ended_at = datetime.now()
            elapsed = ended_at - started_at
            result["meta"]["ended_at"] = str(ended_at)
            # `timedelta.seconds` only covers the sub-day remainder, so `days` must be added
            # explicitly or a run longer than 24 hours would under-report its duration.
            result["meta"]["elapsed_ms"] = (elapsed.days * 86400 + elapsed.seconds) * 1000 + elapsed.microseconds // 1000
            result["meta"]["complete"] = True
            goto_root()
            os.chdir("tests")
            serialized = json.dumps(result, indent=4, ensure_ascii=True)
            print(serialized)

            with open("result.json", "w") as f:
                f.write(serialized)

            # Exit with a non-zero status when at least one test failed.
            if has_error:
                sys.exit(1)

        else:
            print("invalid command:", command)
            print(help_message)

    finally:
        if not no_clean:
            clean()