📦 orlp / foldhash

📄 format-bench-results.py · 62 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import sys
import polars as pl

# Base size from which the "setbuild" divisor below is derived.
# NOTE(review): presumably the element count used by the benchmark harness
# when building a set — confirm against the benchmark source.
MAP_SIZE = 1000

# Divisor applied to the `ns` column of "setbuild" rows before reporting.
SET_BUILD_FACTOR = MAP_SIZE * 10

# Distribution names in presentation order: a name's position in this list
# becomes its primary sort key in the output.
distr_order = [
    # names starting with u32 / u64
    "u32", "u32pair", "u64", "u64lobits", "u64hibits", "u64pair",
    # ipv4 / ipv6 / rgba
    "ipv4", "ipv6", "rgba",
    # names starting with str
    "strenglishword", "struuid", "strurl", "strdate",
    # remaining distributions
    "accesslog", "kilobyte", "tenkilobyte",
]

# Replacement map applied to the "hash" column: long foldhash names from the
# input are swapped for the short labels used in the output tables.
name_repl = {"foldhash-fast": "foldhash-f", "foldhash-quality": "foldhash-q"}

# Benchmark names in presentation order; used as the sort key after the
# distribution.
bench_order = [
    "hashonly",
    "lookupmiss",
    "lookuphit",
    "setbuild",
]

# Hasher names in presentation order. The foldhash entries are the shortened
# labels produced by the name replacement map, not the raw input names.
hash_order = [
    "foldhash-f",
    "foldhash-q",
    "fxhash",
    "ahash",
    "siphash",
]

def _order_frame(column, values):
    """Build a two-column lookup frame mapping each value to its list position.

    The first column is named ``column`` and holds ``values``; the second is
    named ``<column>_order_idx`` and holds ``0 .. len(values) - 1``.
    """
    return pl.DataFrame({column: values, f"{column}_order_idx": range(len(values))})


# One lookup frame per ordering list; these are joined onto the benchmark data
# so rows can be sorted by the positions defined above.
distr_order_df = _order_frame("distr", distr_order)
bench_order_df = _order_frame("bench", bench_order)
hash_order_df = _order_frame("hash", hash_order)

# Lazily read the benchmark CSV whose path is the first command-line argument.
raw_results = pl.scan_csv(sys.argv[1])

# Swap long hasher names for their short labels.
renamed_results = raw_results.with_columns(pl.col("hash").replace(name_repl))

# "setbuild" rows have their ns divided by SET_BUILD_FACTOR; every other
# benchmark is divided by 1.
ns_divisor = (
    pl.when(pl.col("bench") == "setbuild")
    .then(SET_BUILD_FACTOR)
    .otherwise(1)
)
scaled_results = renamed_results.with_columns(ns=pl.col("ns") / ns_divisor)

# Attach the *_order_idx columns used for sorting.
# NOTE(review): these joins use polars' default join strategy (inner per the
# polars docs), so rows whose distr/bench/hash is not listed in the ordering
# tables would be dropped — confirm that filtering is intended.
indexed_results = (
    scaled_results
    .join(distr_order_df.lazy(), on="distr")
    .join(bench_order_df.lazy(), on="bench")
    .join(hash_order_df.lazy(), on="hash")
)

# Sort into presentation order, keep only the reported columns, and
# materialise the lazy query.
df = (
    indexed_results
    .sort(["distr_order_idx", "distr", "bench_order_idx", "hash_order_idx"])
    .select("distr", "bench", "hash", "ns")
    .collect()
)

# Display settings for both printed tables: tbl_rows=-1, two-decimal floats,
# right-aligned cells.
with pl.Config(tbl_rows=-1, float_precision=2, tbl_cell_alignment="RIGHT"):
    # Table 1: ns values pivoted so that each hash gets its own column.
    per_hash_table = df.pivot("hash", values="ns")
    print(per_hash_table)

    # Table 2: per-hash summary metrics.
    # Rank each hash's ns within its (distr, bench) group.
    ranked = df.with_columns(rank=pl.col("ns").rank().over("distr", "bench"))
    summary = ranked.group_by("hash", maintain_order=True).agg(
        avg_rank=pl.col("rank").mean(),
        # exp(mean(log(ns))), i.e. the geometric mean of the timings.
        geometric_mean=pl.col("ns").log().mean().exp(),
    )
    # Flip the summary so hashes become columns and metrics become rows.
    summary_table = summary.transpose(
        include_header=True,
        header_name="metric",
        column_names="hash",
    )
    print(summary_table)