Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-08-14 04:17:53 -04:00)
compare-commits.sh: support both llama-bench and test-backend-ops (#14392)
* compare-commits.sh: support both llama-bench and test-backend-ops
* Speed up the build by specifying -j 12
* Remove build_number from test-backend-ops db
* Apply suggestion from @JohannesGaessler
* Refine tool selection logic
* Address review comments

Signed-off-by: Xiaodong Ye <yeahdongcn@gmail.com>
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
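As a quick illustration of the new interface (hypothetical commit names; trailing arguments are simply forwarded to the selected tool, and -o MUL_MAT is assumed here as a test-backend-ops op filter):

    $ ./scripts/compare-commits.sh master my-branch
    $ ./scripts/compare-commits.sh master my-branch test-backend-ops -o MUL_MAT

The first form defaults to llama-bench, as before; the second benchmarks individual GGML ops and stores the results in test-backend-ops.sqlite.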
scripts/compare-commits.sh
@@ -1,19 +1,41 @@
 #!/usr/bin/env bash
 
 if [ $# -lt 2 ]; then
-    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
+    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [tool] [additional arguments]"
+    echo "  tool: 'llama-bench' (default) or 'test-backend-ops'"
+    echo "  additional arguments: passed to the selected tool"
     exit 1
 fi
 
 set -e
 set -x
 
+# Parse arguments
+commit1=$1
+commit2=$2
+tool=${3:-llama-bench}
+additional_args="${@:4}"
+
+# Validate tool argument
+if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then
+    echo "Error: tool must be 'llama-bench' or 'test-backend-ops'"
+    exit 1
+fi
+
 # verify at the start that the compare script has all the necessary dependencies installed
 ./scripts/compare-llama-bench.py --check
 
-bench_args="${@:3}"
+if [ "$tool" = "llama-bench" ]; then
+    db_file="llama-bench.sqlite"
+    target="llama-bench"
+    run_args="-o sql -oe md $additional_args"
+else # test-backend-ops
+    db_file="test-backend-ops.sqlite"
+    target="test-backend-ops"
+    run_args="perf --output sql $additional_args"
+fi
 
-rm -f llama-bench.sqlite > /dev/null
+rm -f "$db_file" > /dev/null
 
 # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
 if [ -n "$GGML_CUDA" ]; then
@@ -25,14 +47,14 @@ dir="build-bench"
 function run {
     rm -fr ${dir} > /dev/null
     cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
-    cmake --build ${dir} -t llama-bench > /dev/null
-    ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+    cmake --build ${dir} -t $target -j $(nproc) > /dev/null
+    ${dir}/bin/$target $run_args | sqlite3 "$db_file"
 }
 
-git checkout $1 > /dev/null
+git checkout $commit1 > /dev/null
 run
 
-git checkout $2 > /dev/null
+git checkout $commit2 > /dev/null
 run
 
-./scripts/compare-llama-bench.py -b $1 -c $2
+./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file"
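Per checkout, run therefore boils down to one of the following pipelines (a sketch; build-bench is the dir value set above):

    $ build-bench/bin/llama-bench -o sql -oe md $additional_args | sqlite3 llama-bench.sqlite
    $ build-bench/bin/test-backend-ops perf --output sql $additional_args | sqlite3 test-backend-ops.sqlite

Both tools print SQL to stdout in these modes, so piping through sqlite3 materializes the per-commit results database that compare-llama-bench.py then reads.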
@@ -1,16 +1,16 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import logging
|
|
||||||
import argparse
|
import argparse
|
||||||
import heapq
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from glob import glob
|
|
||||||
import sqlite3
|
|
||||||
import json
|
|
||||||
import csv
|
import csv
|
||||||
from typing import Optional, Union
|
import heapq
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
from collections.abc import Iterator, Sequence
|
from collections.abc import Iterator, Sequence
|
||||||
|
from glob import glob
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import git
|
import git
|
||||||
@@ -23,7 +23,7 @@ except ImportError as e:
 logger = logging.getLogger("compare-llama-bench")
 
 # All llama-bench SQL fields
-DB_FIELDS = [
+LLAMA_BENCH_DB_FIELDS = [
     "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
     "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
     "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
@@ -33,7 +33,7 @@ DB_FIELDS = [
     "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
 ]
 
-DB_TYPES = [
+LLAMA_BENCH_DB_TYPES = [
     "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
     "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
     "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",
@@ -42,20 +42,41 @@ DB_TYPES = [
     "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
     "TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
 ]
-assert len(DB_FIELDS) == len(DB_TYPES)
 
-# Properties by which to differentiate results per commit:
-KEY_PROPERTIES = [
+# All test-backend-ops SQL fields
+TEST_BACKEND_OPS_DB_FIELDS = [
+    "test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode",
+    "supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s",
+    "memory_kb", "n_runs"
+]
+
+TEST_BACKEND_OPS_DB_TYPES = [
+    "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT",
+    "INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL",
+    "INTEGER", "INTEGER"
+]
+
+assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES)
+assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
+
+# Properties by which to differentiate results per commit for llama-bench:
+LLAMA_BENCH_KEY_PROPERTIES = [
     "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
     "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
     "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
 ]
 
-# Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
+# Properties by which to differentiate results per commit for test-backend-ops:
+TEST_BACKEND_OPS_KEY_PROPERTIES = [
+    "backend_name", "op_name", "op_params", "test_mode"
+]
 
-# Header names for the table:
-PRETTY_NAMES = {
+# Properties that are boolean and are converted to Yes/No for the table:
+LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
+TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"]
+
+# Header names for the table (llama-bench):
+LLAMA_BENCH_PRETTY_NAMES = {
     "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
     "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
     "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
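Zipping the two new lists the same way the loader does for llama-bench (see the CREATE TABLE call further down) gives the in-memory schema for per-op results; spelled out, with line breaks added for readability:

    CREATE TABLE test_backend_ops(test_time TEXT, build_commit TEXT, backend_name TEXT,
        op_name TEXT, op_params TEXT, test_mode TEXT, supported INTEGER, passed INTEGER,
        error_message TEXT, time_us REAL, flops REAL, bandwidth_gb_s REAL,
        memory_kb INTEGER, n_runs INTEGER);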
@@ -64,21 +85,42 @@ PRETTY_NAMES = {
     "flash_attn": "FlashAttention",
 }
 
-DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
-DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
+# Header names for the table (test-backend-ops):
+TEST_BACKEND_OPS_PRETTY_NAMES = {
+    "backend_name": "Backend", "op_name": "GGML op", "op_params": "Op parameters", "test_mode": "Mode",
+    "supported": "Supported", "passed": "Passed", "error_message": "Error",
+    "flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs"
+}
+
+DEFAULT_SHOW_LLAMA_BENCH = ["model_type"]  # Always show these properties by default.
+DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"]  # Always hide these properties by default.
+
+DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"]  # Always show these properties by default.
+DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"]  # Always hide these properties by default.
 
 GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
 MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
 
-DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
+DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
 
+For llama-bench:
 $ git checkout master
-$ make clean && make llama-bench
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
 $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
 $ git checkout some_branch
-$ make clean && make llama-bench
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
 $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
 $ ./scripts/compare-llama-bench.py
 
+For test-backend-ops:
+$ git checkout master
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
+$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
+$ git checkout some_branch
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
+$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
+$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite
+
 Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
 """
@@ -96,6 +138,13 @@ help_c = (
     "Defaults to the non-master commit for which llama-bench was run most recently."
 )
 parser.add_argument("-c", "--compare", help=help_c)
+help_t = (
+    "The tool whose data is being compared. "
+    "Either 'llama-bench' or 'test-backend-ops'. "
+    "This determines the database schema and comparison logic used. "
+    "If left unspecified, try to determine from the input file."
+)
+parser.add_argument("-t", "--tool", help=help_t, default=None, choices=[None, "llama-bench", "test-backend-ops"])
 help_i = (
     "JSON/JSONL/SQLite/CSV files for comparing commits. "
     "Specify multiple times to use multiple input files (JSON/CSV only). "
@@ -114,7 +163,8 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
 help_s = (
     "Columns to add to the table. "
     "Accepts a comma-separated list of values. "
-    f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
+    f"Legal values for test-backend-ops: {', '.join(TEST_BACKEND_OPS_KEY_PROPERTIES)}. "
+    f"Legal values for llama-bench: {', '.join(LLAMA_BENCH_KEY_PROPERTIES[:-3])}. "
    "Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
     "plus any column where not all data points are the same. "
     "If the columns are manually specified, then the results for each unique combination of the "
@@ -142,8 +192,14 @@ if unknown_args:
     sys.exit(1)
 
 input_file = known_args.input
-if not input_file and os.path.exists("./llama-bench.sqlite"):
-    input_file = ["llama-bench.sqlite"]
+tool = known_args.tool
+
+if not input_file:
+    if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"):
+        input_file = ["llama-bench.sqlite"]
+    elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"):
+        input_file = ["test-backend-ops.sqlite"]
+
 if not input_file:
     sqlite_files = glob("*.sqlite")
     if len(sqlite_files) == 1:
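Net effect: with no -i argument the script first checks for the default database of an explicitly requested tool; otherwise the pre-existing fallback below picks up a lone *.sqlite file in the working directory, and the tool is then inferred from the table found inside that file (see the SQLite3File loader further down).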
@@ -161,14 +217,23 @@ class LlamaBenchData:
     build_len_max: int
     build_len: int = 8
     builds: list[str] = []
-    check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
+    tool: str = "llama-bench"  # Tool type: "llama-bench" or "test-backend-ops"
 
-    def __init__(self):
+    def __init__(self, tool: str = "llama-bench"):
+        self.tool = tool
         try:
             self.repo = git.Repo(".", search_parent_directories=True)
         except git.InvalidGitRepositoryError:
             self.repo = None
 
+        # Set schema-specific properties based on tool
+        if self.tool == "llama-bench":
+            self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
+        elif self.tool == "test-backend-ops":
+            self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"])
+        else:
+            assert False
+
     def _builds_init(self):
         self.build_len = self.build_len_min
 
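Note that the required keys for test-backend-ops deliberately omit avg_ts, which exists only in the llama-bench schema; its records are instead validated against build_commit and test_time plus the four test-backend-ops key properties.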
@@ -252,52 +317,121 @@ class LlamaBenchData:
 class LlamaBenchDataSQLite3(LlamaBenchData):
     connection: sqlite3.Connection
     cursor: sqlite3.Cursor
+    table_name: str
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, tool: str = "llama-bench"):
+        super().__init__(tool)
         self.connection = sqlite3.connect(":memory:")
         self.cursor = self.connection.cursor()
-        self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")
+
+        # Set table name and schema based on tool
+        if self.tool == "llama-bench":
+            self.table_name = "test"
+            db_fields = LLAMA_BENCH_DB_FIELDS
+            db_types = LLAMA_BENCH_DB_TYPES
+        elif self.tool == "test-backend-ops":
+            self.table_name = "test_backend_ops"
+            db_fields = TEST_BACKEND_OPS_DB_FIELDS
+            db_types = TEST_BACKEND_OPS_DB_TYPES
+        else:
+            assert False
+
+        self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});")
 
     def _builds_init(self):
         if self.connection:
-            self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
-            self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
+            self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
+            self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
 
             if self.build_len_min != self.build_len_max:
                 logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
                                "Try purging the the database of old commits.")
-                self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
+                self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
 
-            builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
+            builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall()
             self.builds = list(map(lambda b: b[0], builds))  # list[tuple[str]] -> list[str]
         super()._builds_init()
 
     def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
         data = self.cursor.execute(
-            "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
+            f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall()
         return reversed(data) if reverse else data
 
     def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
+        if self.tool == "llama-bench":
+            return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare)
+        elif self.tool == "test-backend-ops":
+            return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare)
+        else:
+            assert False
+
+    def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
         select_string = ", ".join(
             [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
         equal_string = " AND ".join(
-            [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
+            [f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [
                 f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
         )
         group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
-        query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
+        query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
+                 f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
+        return self.cursor.execute(query).fetchall()
+
+    def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
+        # For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth)
+        select_string = ", ".join(
+            [f"tb.{p}" for p in properties] + [
+                "AVG(tb.flops)", "AVG(tc.flops)",
+                "AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)"
+            ])
+        equal_string = " AND ".join(
+            [f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [
+                f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'",
+                "tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"]  # Only compare successful tests
+        )
+        group_order_string = ", ".join([f"tb.{p}" for p in properties])
+        query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
                  f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
         return self.cursor.execute(query).fetchall()
 
 
 class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
-    def __init__(self, data_file: str):
-        super().__init__()
+    def __init__(self, data_file: str, tool: Any):
+        super().__init__(tool)
 
         self.connection.close()
         self.connection = sqlite3.connect(data_file)
         self.cursor = self.connection.cursor()
 
+        # Check which table exists in the database
+        tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
+        table_names = [table[0] for table in tables]
+
+        # Tool selection logic
+        if tool is None:
+            if "test" in table_names:
+                self.table_name = "test"
+                self.tool = "llama-bench"
+            elif "test_backend_ops" in table_names:
+                self.table_name = "test_backend_ops"
+                self.tool = "test-backend-ops"
+            else:
+                raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
+        elif tool == "llama-bench":
+            if "test" in table_names:
+                self.table_name = "test"
+                self.tool = "llama-bench"
+            else:
+                raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
+        elif tool == "test-backend-ops":
+            if "test_backend_ops" in table_names:
+                self.table_name = "test_backend_ops"
+                self.tool = "test-backend-ops"
+            else:
+                raise RuntimeError(f"Table 'test_backend_ops' not found for tool 'test-backend-ops'. Available tables: {table_names}")
+        else:
+            raise RuntimeError(f"Unknown tool: {tool}")
+
         self._builds_init()
 
     @staticmethod
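As a concrete illustration, with the default show columns backend_name and op_name and hypothetical commit hashes abcd1234 (baseline) and ef567890 (compare), _get_rows_test_backend_ops assembles roughly this self-join (whitespace added):

    SELECT tb.backend_name, tb.op_name,
           AVG(tb.flops), AVG(tc.flops), AVG(tb.bandwidth_gb_s), AVG(tc.bandwidth_gb_s)
    FROM test_backend_ops tb JOIN test_backend_ops tc
      ON tb.backend_name = tc.backend_name AND tb.op_name = tc.op_name
     AND tb.op_params = tc.op_params AND tb.test_mode = tc.test_mode
     AND tb.build_commit = 'abcd1234' AND tc.build_commit = 'ef567890'
     AND tb.supported = 1 AND tc.supported = 1 AND tb.passed = 1 AND tc.passed = 1
    GROUP BY tb.backend_name, tb.op_name ORDER BY tb.backend_name, tb.op_name;

That is, rows are paired across the two commits only when every key property matches and both runs were supported and passed.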
@@ -317,20 +451,23 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
 
 
 class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
-    def __init__(self, data_file: str):
-        super().__init__()
+    def __init__(self, data_file: str, tool: str = "llama-bench"):
+        super().__init__(tool)
+
+        # Get the appropriate field list based on tool
+        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
 
         with open(data_file, "r", encoding="utf-8") as fp:
             for i, line in enumerate(fp):
                 parsed = json.loads(line)
 
-                for k in parsed.keys() - set(DB_FIELDS):
+                for k in parsed.keys() - set(db_fields):
                     del parsed[k]
 
                 if (missing_keys := self._check_keys(parsed.keys())):
                     raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
 
-                self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+                self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
 
         self._builds_init()
 
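For reference, a JSONL record for test-backend-ops carries the schema's field names as keys; a hypothetical line (values invented for illustration) might look like:

    {"build_commit": "abcd1234", "test_time": "2025-06-25T12:00:00Z", "backend_name": "CUDA0", "op_name": "MUL_MAT", "op_params": "...", "test_mode": "perf", "supported": 1, "passed": 1, "error_message": "", "time_us": 123.4, "flops": 1.2e12, "bandwidth_gb_s": 0.0, "memory_kb": 2048, "n_runs": 10}

Keys outside the selected tool's field list are dropped before the row is inserted, and missing required keys raise an error as before.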
@@ -349,21 +486,24 @@ class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
 
 
 class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
-    def __init__(self, data_files: list[str]):
-        super().__init__()
+    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
+        super().__init__(tool)
+
+        # Get the appropriate field list based on tool
+        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
 
         for data_file in data_files:
             with open(data_file, "r", encoding="utf-8") as fp:
                 parsed = json.load(fp)
 
                 for i, entry in enumerate(parsed):
-                    for k in entry.keys() - set(DB_FIELDS):
+                    for k in entry.keys() - set(db_fields):
                         del entry[k]
 
                     if (missing_keys := self._check_keys(entry.keys())):
                         raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
 
-                    self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
+                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
 
         self._builds_init()
 
@@ -384,21 +524,24 @@ class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
 
 
 class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
-    def __init__(self, data_files: list[str]):
-        super().__init__()
+    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
+        super().__init__(tool)
+
+        # Get the appropriate field list based on tool
+        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
 
         for data_file in data_files:
             with open(data_file, "r", encoding="utf-8") as fp:
                 for i, parsed in enumerate(csv.DictReader(fp)):
                     keys = set(parsed.keys())
 
-                    for k in keys - set(DB_FIELDS):
+                    for k in keys - set(db_fields):
                         del parsed[k]
 
                     if (missing_keys := self._check_keys(keys)):
                         raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
 
-                    self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
 
         self._builds_init()
 
@@ -419,21 +562,90 @@ class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
         return True
 
 
+def format_flops(flops_value: float) -> str:
+    """Format FLOPS values with appropriate units for better readability."""
+    if flops_value == 0:
+        return "0.00"
+
+    # Define unit thresholds and names
+    units = [
+        (1e12, "T"),  # TeraFLOPS
+        (1e9, "G"),   # GigaFLOPS
+        (1e6, "M"),   # MegaFLOPS
+        (1e3, "k"),   # kiloFLOPS
+        (1, "")       # FLOPS
+    ]
+
+    for threshold, unit in units:
+        if abs(flops_value) >= threshold:
+            formatted_value = flops_value / threshold
+            if formatted_value >= 100:
+                return f"{formatted_value:.1f}{unit}"
+            else:
+                return f"{formatted_value:.2f}{unit}"
+
+    # Fallback for very small values
+    return f"{flops_value:.2f}"
+
+
+def format_flops_for_table(flops_value: float, target_unit: str) -> str:
+    """Format FLOPS values for table display without unit suffix (since unit is in header)."""
+    if flops_value == 0:
+        return "0.00"
+
+    # Define unit thresholds based on target unit
+    unit_divisors = {
+        "TFLOPS": 1e12,
+        "GFLOPS": 1e9,
+        "MFLOPS": 1e6,
+        "kFLOPS": 1e3,
+        "FLOPS": 1
+    }
+
+    divisor = unit_divisors.get(target_unit, 1)
+    formatted_value = flops_value / divisor
+
+    if formatted_value >= 100:
+        return f"{formatted_value:.1f}"
+    else:
+        return f"{formatted_value:.2f}"
+
+
+def get_flops_unit_name(flops_values: list) -> str:
+    """Determine the best FLOPS unit name based on the magnitude of values."""
+    if not flops_values or all(v == 0 for v in flops_values):
+        return "FLOPS"
+
+    # Find the maximum absolute value to determine appropriate unit
+    max_flops = max(abs(v) for v in flops_values if v != 0)
+
+    if max_flops >= 1e12:
+        return "TFLOPS"
+    elif max_flops >= 1e9:
+        return "GFLOPS"
+    elif max_flops >= 1e6:
+        return "MFLOPS"
+    elif max_flops >= 1e3:
+        return "kFLOPS"
+    else:
+        return "FLOPS"
+
+
 bench_data = None
 if len(input_file) == 1:
     if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
-        bench_data = LlamaBenchDataSQLite3File(input_file[0])
+        bench_data = LlamaBenchDataSQLite3File(input_file[0], tool)
     elif LlamaBenchDataJSON.valid_format(input_file):
-        bench_data = LlamaBenchDataJSON(input_file)
+        bench_data = LlamaBenchDataJSON(input_file, tool)
     elif LlamaBenchDataJSONL.valid_format(input_file[0]):
-        bench_data = LlamaBenchDataJSONL(input_file[0])
+        bench_data = LlamaBenchDataJSONL(input_file[0], tool)
     elif LlamaBenchDataCSV.valid_format(input_file):
-        bench_data = LlamaBenchDataCSV(input_file)
+        bench_data = LlamaBenchDataCSV(input_file, tool)
 else:
     if LlamaBenchDataJSON.valid_format(input_file):
-        bench_data = LlamaBenchDataJSON(input_file)
+        bench_data = LlamaBenchDataJSON(input_file, tool)
     elif LlamaBenchDataCSV.valid_format(input_file):
-        bench_data = LlamaBenchDataCSV(input_file)
+        bench_data = LlamaBenchDataCSV(input_file, tool)
 
 if not bench_data:
     raise RuntimeError("No valid (or some invalid) input files found.")
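The three formatting helpers are pure functions, so their behavior can be pinned down with a few spot checks (values chosen arbitrarily):

    >>> format_flops(1.23e9)
    '1.23G'
    >>> format_flops(2.5e11)               # scaled value >= 100, so one decimal place
    '250.0G'
    >>> get_flops_unit_name([1.23e9, 2.5e11])
    'GFLOPS'
    >>> format_flops_for_table(2.5e11, "GFLOPS")
    '250.0'

format_flops keeps the unit suffix on the value, while format_flops_for_table omits it because the table header already names the unit.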
@@ -504,12 +716,29 @@ else:
 
 name_compare = bench_data.get_commit_name(hexsha8_compare)
 
+# Get tool-specific configuration
+if tool == "llama-bench":
+    key_properties = LLAMA_BENCH_KEY_PROPERTIES
+    bool_properties = LLAMA_BENCH_BOOL_PROPERTIES
+    pretty_names = LLAMA_BENCH_PRETTY_NAMES
+    default_show = DEFAULT_SHOW_LLAMA_BENCH
+    default_hide = DEFAULT_HIDE_LLAMA_BENCH
+elif tool == "test-backend-ops":
+    key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES
+    bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES
+    pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES
+    default_show = DEFAULT_SHOW_TEST_BACKEND_OPS
+    default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS
+else:
+    assert False
+
 # If the user provided columns to group the results by, use them:
 if known_args.show is not None:
     show = known_args.show.split(",")
     unknown_cols = []
     for prop in show:
-        if prop not in KEY_PROPERTIES[:-3]:  # Last three values are n_prompt, n_gen, n_depth.
+        valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3]  # Exclude n_prompt, n_gen, n_depth for llama-bench
+        if prop not in valid_props:
             unknown_cols.append(prop)
     if unknown_cols:
         logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
@@ -518,20 +747,37 @@ if known_args.show is not None:
     rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
 # Otherwise, select those columns where the values are not all the same:
 else:
-    rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
+    rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare)
     properties_different = []
-    for i, kp_i in enumerate(KEY_PROPERTIES):
-        if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
-            continue
-        for row_full in rows_full:
-            if row_full[i] != rows_full[0][i]:
-                properties_different.append(kp_i)
-                break
+
+    if tool == "llama-bench":
+        # For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic
+        check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]]
+        for i, kp_i in enumerate(key_properties):
+            if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]:
+                continue
+            for row_full in rows_full:
+                if row_full[i] != rows_full[0][i]:
+                    properties_different.append(kp_i)
+                    break
+    elif tool == "test-backend-ops":
+        # For test-backend-ops, check all key properties
+        for i, kp_i in enumerate(key_properties):
+            if kp_i in default_show:
+                continue
+            for row_full in rows_full:
+                if row_full[i] != rows_full[0][i]:
+                    properties_different.append(kp_i)
+                    break
+    else:
+        assert False
 
     show = []
-    # Show CPU and/or GPU by default even if the hardware for all results is the same:
-    if rows_full and "n_gpu_layers" not in properties_different:
-        ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
+
+    if tool == "llama-bench":
+        # Show CPU and/or GPU by default even if the hardware for all results is the same:
+        if rows_full and "n_gpu_layers" not in properties_different:
+            ngl = int(rows_full[0][key_properties.index("n_gpu_layers")])
 
-        if ngl != 99 and "cpu_info" not in properties_different:
-            show.append("cpu_info")
+            if ngl != 99 and "cpu_info" not in properties_different:
+                show.append("cpu_info")
@@ -542,8 +788,13 @@ else:
         for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
             if prop in show:
                 index_default += 1
-        show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
-    for prop in DEFAULT_HIDE:
+        show = show[:index_default] + default_show + show[index_default:]
+    elif tool == "test-backend-ops":
+        show = default_show + properties_different
+    else:
+        assert False
+
+    for prop in default_hide:
         try:
             show.remove(prop)
         except ValueError:
@@ -551,7 +802,7 @@ else:
 
 # Add plot_x parameter to parameters to show if it's not already present:
 if known_args.plot:
-    for k, v in PRETTY_NAMES.items():
+    for k, v in pretty_names.items():
         if v == known_args.plot_x and k not in show:
             show.append(k)
             break
@@ -563,7 +814,11 @@ if not rows_show:
     sys.exit(1)
 
 table = []
-for row in rows_show:
+primary_metric = "FLOPS"  # Default to FLOPS for test-backend-ops
+
+if tool == "llama-bench":
+    # For llama-bench, create test names and compare avg_ts values
+    for row in rows_show:
         n_prompt = int(row[-5])
         n_gen = int(row[-4])
         n_depth = int(row[-3])
@@ -578,26 +833,77 @@ for row in rows_show:
         # Regular columns  test name  avg t/s values  Speedup
        # VVVVVVVVVVVVV    VVVVVVVVV  VVVVVVVVVVVVVV  VVVVVVV
         table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
+elif tool == "test-backend-ops":
+    # Determine the primary metric by checking rows until we find one with valid data
+    if rows_show:
+        primary_metric = "FLOPS"  # Default to FLOPS
+        flops_values = []
+
+        # Collect all FLOPS values to determine the best unit
+        for sample_row in rows_show:
+            baseline_flops = float(sample_row[-4])
+            compare_flops = float(sample_row[-3])
+            baseline_bandwidth = float(sample_row[-2])
+
+            if baseline_flops > 0:
+                flops_values.extend([baseline_flops, compare_flops])
+            elif baseline_bandwidth > 0 and not flops_values:
+                primary_metric = "Bandwidth (GB/s)"
+
+        # If we have FLOPS data, determine the appropriate unit
+        if flops_values:
+            primary_metric = get_flops_unit_name(flops_values)
+
+    # For test-backend-ops, prioritize FLOPS > bandwidth for comparison
+    for row in rows_show:
+        # Extract metrics: flops, bandwidth_gb_s (baseline and compare)
+        baseline_flops = float(row[-4])
+        compare_flops = float(row[-3])
+        baseline_bandwidth = float(row[-2])
+        compare_bandwidth = float(row[-1])
+
+        # Determine which metric to use for comparison (prioritize FLOPS > bandwidth)
+        if baseline_flops > 0 and compare_flops > 0:
+            # Use FLOPS comparison (higher is better)
+            speedup = compare_flops / baseline_flops
+            baseline_str = format_flops_for_table(baseline_flops, primary_metric)
+            compare_str = format_flops_for_table(compare_flops, primary_metric)
+        elif baseline_bandwidth > 0 and compare_bandwidth > 0:
+            # Use bandwidth comparison (higher is better)
+            speedup = compare_bandwidth / baseline_bandwidth
+            baseline_str = f"{baseline_bandwidth:.2f}"
+            compare_str = f"{compare_bandwidth:.2f}"
+        else:
+            # Fallback if no valid data is available
+            baseline_str = "N/A"
+            compare_str = "N/A"
+            from math import nan
+            speedup = nan
+
+        table.append(list(row[:-4]) + [baseline_str, compare_str, speedup])
+else:
+    assert False
 
 # Some a-posteriori fixes to make the table contents prettier:
-for bool_property in BOOL_PROPERTIES:
+for bool_property in bool_properties:
     if bool_property in show:
         ip = show.index(bool_property)
         for row_table in table:
             row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
 
-if "model_type" in show:
-    ip = show.index("model_type")
-    for (old, new) in MODEL_SUFFIX_REPLACE.items():
-        for row_table in table:
-            row_table[ip] = row_table[ip].replace(old, new)
+if tool == "llama-bench":
+    if "model_type" in show:
+        ip = show.index("model_type")
+        for (old, new) in MODEL_SUFFIX_REPLACE.items():
+            for row_table in table:
+                row_table[ip] = row_table[ip].replace(old, new)
 
-if "model_size" in show:
-    ip = show.index("model_size")
-    for row_table in table:
-        row_table[ip] = float(row_table[ip]) / 1024 ** 3
+    if "model_size" in show:
+        ip = show.index("model_size")
+        for row_table in table:
+            row_table[ip] = float(row_table[ip]) / 1024 ** 3
 
-if "gpu_info" in show:
-    ip = show.index("gpu_info")
-    for row_table in table:
-        for gns in GPU_NAME_STRIP:
+    if "gpu_info" in show:
+        ip = show.index("gpu_info")
+        for row_table in table:
+            for gns in GPU_NAME_STRIP:
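Worked example (numbers invented): a MUL_MAT row whose baseline AVG(flops) is 1.10e12 against a compare value of 1.32e12 gets speedup = 1.32e12 / 1.10e12 = 1.2; with TFLOPS selected as the primary metric the two value cells render as '1.10' and '1.32'.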
@@ -609,14 +915,19 @@ if "gpu_info" in show:
             if len(gpu_names) >= 2 and all_names_the_same:
                 row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
 
-headers = [PRETTY_NAMES[p] for p in show]
-headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
+headers = [pretty_names.get(p, p) for p in show]
+if tool == "llama-bench":
+    headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
+elif tool == "test-backend-ops":
+    headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"]
+else:
+    assert False
 
 if known_args.plot:
-    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
+    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"):
         try:
-            import matplotlib.pyplot as plt
             import matplotlib
+            import matplotlib.pyplot as plt
             matplotlib.use('Agg')
         except ImportError as e:
             logger.error("matplotlib is required for --plot.")
@@ -627,7 +938,7 @@ if known_args.plot:
         plot_x_label = plot_x_param
 
         if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
-            pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
+            pretty_name = LLAMA_BENCH_PRETTY_NAMES.get(plot_x_param, plot_x_param)
             if pretty_name in data_headers:
                 plot_x_index = data_headers.index(pretty_name)
                 plot_x_label = pretty_name
@@ -746,8 +1057,16 @@ if known_args.plot:
 
         title = ', '.join(title_parts) if title_parts else "Performance comparison"
 
+        # Determine y-axis label based on tool type
+        if tool_type == "llama-bench":
+            y_label = "Tokens per second (t/s)"
+        elif tool_type == "test-backend-ops":
+            y_label = metric_name
+        else:
+            assert False
+
         ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
-        ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
+        ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
         ax.set_title(title, fontsize=12, fontweight='bold')
         ax.legend(loc='best', fontsize=10)
         ax.grid(True, alpha=0.3)
@@ -765,7 +1084,7 @@ if known_args.plot:
         plt.savefig(output_file, dpi=300, bbox_inches='tight')
         plt.close()
 
-    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
+    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric)
 
 print(tabulate(  # noqa: NP100
     table,