import argparse
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def load_data(args):
    """Load the JSON document named by ``args.i``.

    Args:
        args: Parsed command-line namespace; only ``args.i`` (the input path)
            is read.

    Returns:
        A ``(data, name)`` tuple. ``data`` is the decoded JSON document and
        ``name`` is a short label derived from the path: its final component,
        truncated at the first ``"."``.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with open(args.i, "r", encoding="utf-8") as f:
        data = json.load(f)
    # basename() honours the host OS's path separators, unlike splitting on a
    # hard-coded "/".
    name = os.path.basename(args.i).split(".")[0]
    return data, name
def _chars_per_token(stats, languages):
    """Return total characters divided by total tokens over ``languages``.

    ``stats`` maps a language name to a dict with ``"num_chars"`` and
    ``"num_tokens"`` entries. Returns ``None`` when the selected languages
    contain no tokens, so the caller can skip the line instead of dividing
    by zero.
    """
    total_tokens = sum(stats[lang]["num_tokens"] for lang in languages)
    if not total_tokens:
        return None
    total_chars = sum(stats[lang]["num_chars"] for lang in languages)
    return total_chars / total_tokens


def plot_cpt(args, data, filename):
    """Plot the characters-per-token ratio of every language as a bar chart.

    Bars are ordered by descending character count. A dashed red line marks
    the overall ratio (total characters / total tokens), and a dotted line is
    added for each predefined language subset whose languages are all present.

    Args:
        args: Parsed command-line namespace. If ``args.cpt`` is truthy the
            figure is saved to that path, otherwise it is shown.
        data: Loaded document; ``data["compression"]`` maps each language to
            a dict with ``"chars_per_token"``, ``"num_chars"`` and
            ``"num_tokens"``.
        filename: Label appended to the plot title.
    """
    stats = data["compression"]
    df = pd.DataFrame.from_dict(stats, orient="index").reset_index()
    df.rename(
        columns={
            "index": "Language",
            "chars_per_token": "Characters per Token",
            "num_chars": "Chars",
        },
        inplace=True,
    )
    df.sort_values(by="Chars", ascending=False, inplace=True)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(
        x="Language",
        y="Characters per Token",
        hue="Characters per Token",
        data=df,
        palette="viridis",
        legend=False,
        ax=ax,
    )
    # Fixed upper bound for the y axis; bars taller than 7 are clipped.
    ax.set_ylim(top=7)

    # Tick positions are shifted right of the bar centres -- presumably so
    # that the rotated, right-anchored labels line up with their bars.
    offset = 0.35
    ax.set_xticks(np.arange(len(df["Language"])) + offset)
    ax.set_xticklabels(df["Language"], rotation=45, ha="right")

    ax.set_title(f"Character per Token Ratio by Language ({filename})")
    ax.set_xlabel("")

    average = _chars_per_token(stats, stats)
    if average is not None:
        ax.axhline(
            y=average, color="r", linestyle="--", label=f"Average: {average:.2f}"
        )

    humanevalx_languages = ["go", "python", "cpp", "java", "javascript"]
    codegeex_languages = [
        "jsx",
        "javascript",
        "typescript",
        "java",
        "python",
        "html",
        "cpp",
        "c",
    ]
    for subset, name, color in [
        (humanevalx_languages, "HumanEvalX", "green"),
        (codegeex_languages, "CodeGeeX", "blue"),
    ]:
        # A subset average is only drawn when every one of its languages has
        # statistics in the input.
        if not all(lang in stats for lang in subset):
            continue
        subset_average = _chars_per_token(stats, subset)
        if subset_average is None:
            continue
        ax.axhline(
            y=subset_average,
            color=color,
            linestyle="dotted",
            label=f"{name} Average: {subset_average:.2f}",
        )
    ax.legend()

    # Compute the layout only after the x label has been cleared and all
    # artists exist, so no room is reserved for a label that is not drawn.
    fig.tight_layout()

    if args.cpt:
        fig.savefig(args.cpt, dpi=300)
        # Close the saved figure; otherwise it stays registered with pyplot
        # and a later plt.show() would display it as well.
        plt.close(fig)
    else:
        plt.show()
def plot_freq(args, data, filename):
    """Render bucketed counts as a log-scale bar chart of percentages.

    For each configured plot, the bucket values under the given key are
    normalised to percentages of their sum and drawn as one bar per bucket
    (numbered from 1) on a logarithmic y axis.

    Args:
        args: Parsed command-line namespace. If ``args.freq`` is truthy the
            figure is saved to that path, otherwise it is shown.
        data: Loaded document; ``data["frequency_buckets"]`` is a sequence of
            numeric bucket values.
        filename: Label appended to the plot title.
    """
    # (key in ``data``, plot title, output path or None) for every plot.
    plot_specs = [
        ("frequency_buckets", "Token Frequency Distribution", args.freq),
    ]
    for key, title, out in plot_specs:
        counts = np.array(data[key], dtype=np.float64)
        percentages = counts / counts.sum() * 100
        frame = pd.DataFrame(
            {
                "Buckets": range(1, len(percentages) + 1),
                "Frequency": percentages,
            }
        )

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(
            x="Buckets",
            y="Frequency",
            data=frame,
            ax=ax,
            color="lightblue",
            width=1.0,
        )

        # Log scale with a fixed range of 0.0001% to 100%.
        ax.set_yscale("log")
        # An empty label list removes the per-bucket tick labels.
        ax.set_xticklabels([])
        ax.set_ylim(0.0001, 100.0)
        ax.yaxis.set_major_formatter("{x}%")
        ax.set_ylabel("Rate of Occurrence (%)")
        ax.set_title(f"{title} ({filename})")

        fig.tight_layout()
        ax.grid(linestyle="dotted")

        if not out:
            plt.show()
        else:
            plt.savefig(out, dpi=300)
def build_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` with a required ``-i`` input path and
        optional ``--cpt`` / ``--freq`` output paths.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Plot the characters-per-token ratio by language and the token "
            "frequency distribution."
        )
    )
    parser.add_argument(
        "-i",
        type=str,
        required=True,
        # The file is parsed with a single json.load call, i.e. it must be
        # one JSON document.
        help="Path to the input JSON statistics file",
    )
    parser.add_argument(
        "--cpt",
        type=str,
        help=(
            "Path to the output file for characters per token ratio "
            "(the plot is shown instead when omitted)"
        ),
    )
    parser.add_argument(
        "--freq",
        type=str,
        help=(
            "Path to the output file for token frequency distribution "
            "(the plot is shown instead when omitted)"
        ),
    )
    return parser


def main(argv=None):
    """Parse the command line, configure plot styling and draw both plots.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    sns.set_theme(
        style="whitegrid",
        palette="pastel",
        font_scale=1.2,
        rc={"font.family": "Times New Roman"},
    )
    # Applied after set_theme so this font size is the one in effect.
    plt.rcParams.update({"font.size": 14})
    data, filename = load_data(args)
    plot_cpt(args, data, filename)
    plot_freq(args, data, filename)


if __name__ == "__main__":
    main()