1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Ragit Stress Test
#
# This is not part of the test suite (`tests.py all`) because it takes too long,
# and it is not run in the CI/CD pipeline either.
# If you have made a performance improvement, please run this script and
# record the result in this file (see `log` below).
import json
import os
import shutil
import subprocess
import time
from utils import (
cargo_run,
clean,
goto_root,
mk_and_cd_tmp_dir,
write_string,
)
def timeit(name: str, f, result: dict) -> None:
    """Run `f` once and record its elapsed time under `result[name]`.

    Args:
        name: Key under which the measurement is stored in `result`.
        f: Zero-argument callable to measure; its return value is discarded.
        result: Dict that accumulates the measurements; mutated in place.

    The elapsed time is stored as a whole number of milliseconds (truncated
    toward zero). After the measurement, the whole `result` dict is printed
    as indented JSON, which shows every measurement collected so far.
    If `f` raises, nothing is recorded and the exception propagates.
    """
    # `perf_counter` is monotonic and high-resolution, so a wall-clock
    # adjustment during a long step cannot skew the measurement or make it
    # negative, which can happen with `time.time()`.
    start = time.perf_counter()
    f()
    elapsed = time.perf_counter() - start

    result[name] = int(elapsed * 1000)
    print(json.dumps(result, indent=4))
def run():
    """Benchmark a series of ragit commands against a pinned checkout of git/git.

    Clones https://github.com/git/git inside the directory prepared by
    `mk_and_cd_tmp_dir`, checks out a fixed commit, and times each
    `cargo_run` step with `timeit`.

    Returns:
        A dict mapping each step name to the value recorded by `timeit`
        (elapsed milliseconds).

    Raises:
        subprocess.CalledProcessError: If `git clone` or `git checkout`
            exits with a non-zero status.
    """
    goto_root()
    mk_and_cd_tmp_dir()
    result = {}

    # I first tried the linux kernel, but it's too big. (maybe later!)
    # I then tried the rust compiler, but `cargo run` behaves differently in that repository.
    # `check=True`: stop immediately if git fails, instead of silently
    # benchmarking a different tree than the one the recorded logs used.
    subprocess.run(["git", "clone", "https://github.com/git/git"], check=True)
    os.chdir("git")
    # The newest commit at the time of writing.
    subprocess.run(["git", "checkout", "757161efcca150a9a96b312d9e780a071e601a03"], check=True)
    cargo_run(["init"])
    cargo_run(["config", "--set", "model", "dummy"])

    # NOTE: as of 285b54, we don't need this line anymore, but I'll just keep it.
    write_string(".ragignore", ".git")

    # Phase 1: `build` first, then `ii-build`, timing `tfidf` before and after.
    timeit("add all files", lambda: cargo_run(["add", "--all"]), result)
    timeit("build without ii", lambda: cargo_run(["build"]), result)
    timeit("tfidf without ii", lambda: cargo_run(["tfidf", "file system"]), result)
    timeit("ii-build from scratch", lambda: cargo_run(["ii-build"]), result)
    timeit("tfidf with ii", lambda: cargo_run(["tfidf", "file system"]), result)

    # Phase 2: wipe `.ragit` and re-initialize so that `ii-build` runs
    # before the timed `build` this time.
    shutil.rmtree(".ragit")
    cargo_run(["init"])
    cargo_run(["config", "--set", "model", "dummy"])
    cargo_run(["add", "--all"])
    cargo_run(["ii-build"])
    timeit("build with incremental ii", lambda: cargo_run(["build"]), result)

    # Listing commands, each with a directory argument and a single-file argument.
    timeit("ls-files reftable", lambda: cargo_run(["ls-files", "reftable"]), result)
    timeit("ls-files reftable/iter.c", lambda: cargo_run(["ls-files", "reftable/iter.c"]), result)
    timeit("ls-chunks reftable", lambda: cargo_run(["ls-chunks", "reftable"]), result)
    timeit("ls-chunks reftable/iter.c", lambda: cargo_run(["ls-chunks", "reftable/iter.c"]), result)

    # `clean` comes from `utils`; presumably it removes the temporary
    # directory created above -- TODO confirm.
    clean()
    return result
if __name__ == "__main__":
    # Script entry point: run the whole benchmark, then dump the final
    # timings as indented JSON.
    final_timings = run()
    print(json.dumps(final_timings, indent=4))
# Recorded results of previous runs, one dict per run.
# Keys are the step names passed to `timeit` in `run`; values are the
# elapsed times in milliseconds.
log = [
    # test run 1
    # commit d3d834
    # Apple Silicon M3 Pro
    {
        "add all files": 606,
        "build without ii": 63020,
        "tfidf without ii": 3113,
        "ii-build from scratch": 58623,
        "tfidf with ii": 887,
        "build with incremental ii": 485101,
        "ls-files reftable": 879,
        "ls-files reftable/iter.c": 440,
        "ls-chunks reftable": 436,
        "ls-chunks reftable/iter.c": 425,
    },

    # test run 2
    # commit 11bcd4
    # Apple Silicon M3 Pro
    {
        "add all files": 602,
        "build without ii": 293189,
        "tfidf without ii": 2187,
        "ii-build from scratch": 61521,
        "tfidf with ii": 911,
        "build with incremental ii": 452734,
        "ls-files reftable": 932,
        "ls-files reftable/iter.c": 533,
        "ls-chunks reftable": 464,
        "ls-chunks reftable/iter.c": 504,
    },
]
'''
# Memo
NOTE: The test sample has 4583 files with 17095 chunks. It's big enough for a source-code RAG, but not for a general search engine.
- test run 1: first run
- An inverted index makes `tfidf` 3.5x faster. It's nice to see that ii is working.
- `build with incremental ii` is terribly slow. It's 3.98x slower than `build without ii` + `ii-build from scratch`.
- This is likely because `flush_ii_buffer` is called too frequently: the buffer is flushed once per file, which means it's called 4583 times.
- `ii-build from scratch` flushes only 4 times.
- The other commands finish in under a second. That's good enough for CLI users, but not for library users.
- test run 2: with multiprocess build
- Multiprocess workers are not as good as I expected.
- `build without ii` takes 4.65 times longer than in the first run. This is likely because of the overhead of the workers.
- I expected `build with incremental ii` to get much better, but it hasn't. It's 7% faster, but that's not enough.
'''