Commit 37ff27e

Merge pull request #93 from python-constraint/performance_tests

Automatic performance benchmarking

2 parents 2d3885d + 9fd0036

File tree: 7 files changed, +371 -61 lines

β€Ž.github/workflows/build-test-python-package.yml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,23 @@ jobs:
2525
- uses: actions/checkout@v4
2626
- uses: fjwillemsen/setup-nox2@v3.0.0
2727
- run: |
28-
nox
28+
nox -- ${{ runner.os }}
29+
- name: Store benchmark result
30+
uses: benchmark-action/github-action-benchmark@v1
31+
with:
32+
tool: "pytest"
33+
output-file-path: .benchmarks/benchmark_${{ runner.os }}_3.13.json
34+
gh-pages-branch: main
35+
benchmark-data-dir-path: docs/benchmarks
36+
fail-on-alert: true
37+
# GitHub API token to make a commit comment
38+
github-token: ${{ secrets.GITHUB_TOKEN }}
39+
comment-on-alert: true
40+
comment-always: true
41+
# alert-comment-cc-users: '@fjwillemsen' mention a GitHub user in the comment
2942
- name: Report to Coveralls
3043
uses: coverallsapp/github-action@v2
3144
with:
3245
file: coverage.xml
3346
format: cobertura
47+
fail-on-error: false
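The "Store benchmark result" step hands the pytest-benchmark JSON produced by the nox run directly to github-action-benchmark. As a hedged sketch (not part of this commit), the same file can be inspected locally; the key names follow pytest-benchmark's JSON layout, and the Linux path below is an assumption for a Linux runner:

import json

# hypothetical path; on a GitHub runner, ${{ runner.os }} would be e.g. "Linux"
with open(".benchmarks/benchmark_Linux_3.13.json") as f:
    data = json.load(f)

# pytest-benchmark writes one entry per benchmarked test under "benchmarks"
for bench in data["benchmarks"]:
    print(f'{bench["name"]}: mean {bench["stats"]["mean"]:.4f} s')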

β€Ž.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ pip-log.txt
3434
pip-delete-this-directory.txt
3535

3636
# Unit test / coverage reports
37+
.benchmarks
3738
htmlcov/
3839
.tox/
3940
.coverage

β€Ždocs/benchmarks/.gitkeep

Whitespace-only changes.

β€Žnoxfile.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import nox
99
from nox import Session, session
10+
from pathlib import Path
1011

1112
# from nox_poetry import Session, session # nox_poetry is a better option, but <=1.0.3 has a bug with filename-URLs
1213

@@ -21,6 +22,9 @@
2122
nox.options.stop_on_first_error = True
2223
nox.options.error_on_missing_interpreters = True
2324

25+
# create the benchmark folder
26+
Path(".benchmarks").mkdir(exist_ok=True)
27+
2428

2529
# Test code quality: linting
2630
@session
@@ -35,13 +39,19 @@ def lint(session: Session) -> None:
3539
# do not forget check / set the versions with `pyenv global`, or `pyenv local` in case of virtual environment
3640
def tests(session: Session) -> None:
3741
"""Run the tests for the specified Python versions."""
42+
# get command line arguments
43+
if session.posargs:
44+
os_name = session.posargs[0]
45+
else:
46+
os_name = 'local'
47+
3848
# install the dev-dependencies and build the package
3949
session.install("poetry")
4050
session.run("poetry", "install", "--with", "dev,test", external=True)
4151
# session.poetry.installroot(distribution_format="sdist")
4252

4353
# run pytest on the package with C-extensions, disable required coverage percentage
44-
session.run("pytest", "--no-cov")
54+
session.run("pytest", "--no-cov", "--benchmark-json", f".benchmarks/benchmark_{os_name}_{session.python}.json")
4555

4656
# for the last Python version session:
4757
if session.python == python_versions_to_test[-1]:
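With `--benchmark-json`, every test that uses pytest-benchmark's `benchmark` fixture is recorded in the named file. A minimal sketch of such a test (hypothetical, for illustration only):

def test_sum(benchmark):
    # the fixture repeatedly calls sum(range(1000)) and records timing statistics
    result = benchmark(sum, range(1000))
    assert result == 499500  # the benchmarked callable still returns its result

Locally, running `nox` without positional arguments falls back to `os_name = 'local'`, so results land in e.g. `.benchmarks/benchmark_local_3.13.json`.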

β€Žpoetry.lock

Lines changed: 96 additions & 59 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

β€Žpyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ sphinx-pyproject = "^0.3.0"
6565
optional = true
6666
[tool.poetry.group.test.dependencies]
6767
pytest = "^8.3.3"
68+
pytest-benchmark = "^5.1.0"
6869
pytest-cov = "^6.0.0"
6970
nox = "^2024.10.9"
7071
ruff = "^0.7.2"

β€Žtests/test_benchmark.py

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
from random import random
2+
from time import perf_counter
3+
import pytest
4+
from constraint import Problem
5+
from math import sqrt
6+
7+
8+
# reference times (using A4000 on DAS6)
9+
reference_microbenchmark_mean = [0.3784186691045761, 0.4737640768289566, 0.10726054509480794, 0.10744890073935191, 0.10979799057046573, 0.15360217044750848, 0.14483965436617532, 0.054416230569283165, 0.13835338006416956, 0.1371802551050981] # noqa E501
10+
reference_results = {
11+
"microhh": 1.1565620,
12+
"dedispersion": 0.1171140,
13+
"hotspot": 2.6839208,
14+
}
15+
# device properties (for A4000 on DAS6 using get_opencl_device_info.cpp)
16+
dev = {
17+
"max_threads": 1024,
18+
"max_threads_per_sm": 1024,
19+
"max_threads_per_block": 1536,
20+
"max_shared_memory_per_block": 49152,
21+
"max_shared_memory": 102400,
22+
"max_wi_size": [1024, 1024, 64],
23+
"max_wg_size": 1024,
24+
}
25+
# collect benchmark times
26+
benchmark_results = dict()
27+
28+
@pytest.mark.skip
29+
def get_performance_factor(repeats=3):
30+
"""Run microbenchmarks to indicate how much slower this system is compared to the reference."""
31+
32+
def cpu_1():
33+
"""Matrix multiplication"""
34+
size = 100
35+
A = [[random() for _ in range(size)] for _ in range(size)]
36+
B = [[random() for _ in range(size)] for _ in range(size)]
37+
result = [[sum(A[i][k] * B[k][j] for k in range(size)) for j in range(size)] for i in range(size)]
38+
return result
39+
40+
def cpu_2():
41+
"""Element-wise arithmetic"""
42+
N = 10**6
43+
A = [random() for _ in range(N)]
44+
B = [random() for _ in range(N)]
45+
return [A[i] + B[i] for i in range(N)]
46+
47+
def cpu_3():
48+
"""Addition"""
49+
N = 10**6
50+
return [i + i for i in range(N)]
51+
52+
def cpu_4():
53+
"""Multiplication"""
54+
N = 10**6
55+
return [i * i for i in range(N)]
56+
57+
def cpu_5():
58+
"""Division"""
59+
N = 10**6
60+
return [i / i for i in range(1, N+1)]
61+
62+
def mem_1():
63+
"""Array copying"""
64+
N = 10**6
65+
A = [random() for _ in range(N)]
66+
return A.copy()
67+
68+
def mem_2():
69+
"""Array slicing"""
70+
N = 10**6
71+
A = [random() for _ in range(N)]
72+
return A[::2]
73+
74+
def mem_3():
75+
"""Dictionary lookup"""
76+
N = 10**3
77+
keys = list(range(N))
78+
values = list(range(N))
79+
lst = list(zip(keys, values))
80+
return [next((v for k, v in lst if k == i), None) for i in range(N)]
81+
82+
def cache_1():
83+
"""Sequential array sum"""
84+
N = 10**6
85+
A = [random() for _ in range(N)]
86+
return sum(A)
87+
88+
def cache_2():
89+
"""Strided array sum"""
90+
N = 10**6
91+
A = [random() for _ in range(N)]
92+
return sum(A[::2])
93+
94+
# run the benchmarks
95+
benchmarks = [cpu_1, cpu_2, cpu_3, cpu_4, cpu_5, mem_1, mem_2, mem_3, cache_1, cache_2]
96+
raw_data = [list() for _ in range(repeats)]
97+
for i in range(repeats):
98+
for f in benchmarks:
99+
start = perf_counter()
100+
f()
101+
duration = perf_counter() - start
102+
raw_data[i].append(duration)
103+
104+
# non-Numpy implementation of statistics calculation
105+
transposed_data = list(zip(*raw_data)) # transpose the raw_data to get columns as rows
106+
107+
# calculate mean along axis=0 (column-wise) (`benchmark_data.mean(axis=0)`)
108+
benchmark_mean = [sum(column) / len(column) for column in transposed_data]
109+
110+
# calculate standard deviation along axis=0 (column-wise)
111+
def stddev(column, mean):
112+
variance = sum((x - mean) ** 2 for x in column) / len(column)
113+
return sqrt(variance)
114+
115+
# calculate relative standard deviation (`(benchmark_data.std(axis=0) / abs(np_benchmark_mean))`)
116+
benchmark_std = [stddev(column, mean) for column, mean in zip(transposed_data, benchmark_mean)]
117+
relative_std = [(s / abs(m)) if m != 0 else 0 for s, m in zip(benchmark_std, benchmark_mean)]
118+
119+
# calculate mean relative standard deviation and apply threshold (`max(np.mean(np_relative_std), 0.125)`)
120+
mean_relative_std = max(sum(relative_std) / len(relative_std), 0.125)
121+
122+
# calculate performance factor (`np.mean(np_benchmark_mean / reference_microbenchmark_mean)`)
123+
performance_factor = sum(bm / rm for bm, rm in zip(benchmark_mean, reference_microbenchmark_mean)) / len(benchmark_mean)
124+
return performance_factor, mean_relative_std
125+
126+
performance_factor, mean_relative_std = get_performance_factor()
127+
print(f"\nSystem performance factor: {round(performance_factor, 3)}")
128+
129+
@pytest.mark.skip
130+
def check_benchmark_performance(benchmark_name, mean, std):
131+
"""Utility function to check whether the performance of a benchmark is within the expected range and print information."""
132+
reference_result = reference_results[benchmark_name]
133+
assert mean - std * 2 <= reference_result * (performance_factor + mean_relative_std * 2)
134+
print(f"Reference: {round(reference_result, 3)}, benchmark: {round(mean, 3)}, expected: {round(reference_result * performance_factor, 3)}")
135+
136+
137+
def test_microhh(benchmark):
138+
"""Based on the MicroHH search space in the paper."""
139+
benchmark_name = "microhh"
140+
141+
cta_padding = 0 # default argument
142+
143+
# setup the tunable parameters
144+
problem = Problem()
145+
problem.addVariable("STATIC_STRIDES", [0])
146+
problem.addVariable("TILING_STRATEGY", [0])
147+
problem.addVariable("REWRITE_INTERP", [0])
148+
problem.addVariable("BLOCK_SIZE_X", [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024])
149+
problem.addVariable("BLOCK_SIZE_Y", [1, 2, 4, 8, 16, 32])
150+
problem.addVariable("BLOCK_SIZE_Z", [1, 2, 4])
151+
problem.addVariable("TILING_FACTOR_X", [1, 2, 4, 8])
152+
problem.addVariable("TILING_FACTOR_Y", [1, 2, 4])
153+
problem.addVariable("TILING_FACTOR_Z", [1, 2, 4])
154+
problem.addVariable("LOOP_UNROLL_FACTOR_X",[1, 2, 4, 8])
155+
problem.addVariable("LOOP_UNROLL_FACTOR_Y", [1, 2, 4])
156+
problem.addVariable("LOOP_UNROLL_FACTOR_Z", [1, 2, 4])
157+
problem.addVariable("BLOCKS_PER_MP", [0, 1, 2, 3, 4])
158+
159+
# setup the restrictions
160+
problem.addConstraint([
161+
f"BLOCK_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_Z * BLOCKS_PER_MP <= {dev['max_threads_per_sm']}",
162+
f"32 <= BLOCK_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_Z <= {dev['max_threads_per_block']}",
163+
"LOOP_UNROLL_FACTOR_X == 0 or TILING_FACTOR_X % LOOP_UNROLL_FACTOR_X == 0",
164+
"LOOP_UNROLL_FACTOR_Y == 0 or TILING_FACTOR_Y % LOOP_UNROLL_FACTOR_Y == 0",
165+
"LOOP_UNROLL_FACTOR_Z == 0 or TILING_FACTOR_Z % LOOP_UNROLL_FACTOR_Z == 0",
166+
f"BLOCK_SIZE_X * TILING_FACTOR_X > {cta_padding}",
167+
f"BLOCK_SIZE_Y * TILING_FACTOR_Y > {cta_padding}",
168+
f"BLOCK_SIZE_Z * TILING_FACTOR_Z > {cta_padding}",
169+
])
170+
171+
# run the benchmark and check for valid outcome and performance degradation
172+
solutions = benchmark(problem.getSolutions)
173+
reference_result = reference_results[benchmark_name]
174+
benchmark_result = benchmark.stats.stats.mean
175+
benchmark_results[benchmark_name] = benchmark_result
176+
assert len(solutions) == 138600
177+
check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)
178+
179+
180+
def test_dedispersion(benchmark):
181+
"""Based on the Dedispersion search space in the paper."""
182+
benchmark_name = "dedispersion"
183+
184+
# setup the tunable parameters
185+
problem = Problem()
186+
problem.addVariable("block_size_x", [1, 2, 4, 8] + [16 * i for i in range(1, 3)])
187+
problem.addVariable("block_size_y", [8 * i for i in range(4, 33)])
188+
problem.addVariable("block_size_z", [1])
189+
problem.addVariable("tile_size_x", [i for i in range(1, 5)])
190+
problem.addVariable("tile_size_y", [i for i in range(1, 9)])
191+
problem.addVariable("tile_stride_x", [0, 1])
192+
problem.addVariable("tile_stride_y", [0, 1])
193+
problem.addVariable("loop_unroll_factor_channel", [
194+
0
195+
])
196+
197+
# setup the restrictions
198+
check_block_size = "32 <= block_size_x * block_size_y <= 1024"
199+
check_tile_stride_x = "tile_size_x > 1 or tile_stride_x == 0"
200+
check_tile_stride_y = "tile_size_y > 1 or tile_stride_y == 0"
201+
problem.addConstraint([check_block_size, check_tile_stride_x, check_tile_stride_y])
202+
203+
# run the benchmark and check for valid outcome and performance degradation
204+
solutions = benchmark(problem.getSolutions)
205+
reference_result = reference_results[benchmark_name]
206+
benchmark_result = benchmark.stats.stats.mean
207+
benchmark_results[benchmark_name] = benchmark_result
208+
assert len(solutions) == 11130
209+
check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)
210+
211+
212+
def test_hotspot(benchmark):
213+
"""Based on the Hotspot search space in the paper."""
214+
benchmark_name = "hotspot"
215+
216+
# constants
217+
temporal_tiling_factor = [i for i in range(1, 11)]
218+
max_tfactor = max(temporal_tiling_factor)
219+
220+
# setup the tunable parameters
221+
problem = Problem()
222+
problem.addVariable("block_size_x", [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)])
223+
problem.addVariable("block_size_y", [2**i for i in range(6)])
224+
problem.addVariable("tile_size_x", [i for i in range(1, 11)])
225+
problem.addVariable("tile_size_y", [i for i in range(1, 11)])
226+
problem.addVariable("temporal_tiling_factor", temporal_tiling_factor)
227+
problem.addVariable("max_tfactor", [max_tfactor])
228+
problem.addVariable("loop_unroll_factor_t", [i for i in range(1, max_tfactor + 1)])
229+
problem.addVariable("sh_power", [0, 1])
230+
problem.addVariable("blocks_per_sm", [0, 1, 2, 3, 4])
231+
232+
# setup the restrictions
233+
problem.addConstraint([
234+
"block_size_x*block_size_y >= 32",
235+
"temporal_tiling_factor % loop_unroll_factor_t == 0",
236+
f"block_size_x*block_size_y <= {dev['max_threads']}",
237+
f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}",
238+
f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})",
239+
])
240+
241+
# run the benchmark and check for valid outcome and performance degradation
242+
solutions = benchmark(problem.getSolutions)
243+
reference_result = reference_results[benchmark_name]
244+
benchmark_result = benchmark.stats.stats.mean
245+
benchmark_results[benchmark_name] = benchmark_result
246+
assert len(solutions) == 349853
247+
check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)
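The assertion in check_benchmark_performance scales the reference time by the measured performance factor and widens both sides by two standard deviations (of the local run and of the microbenchmarks, respectively). A hedged worked example with assumed numbers:

# all numbers below are assumptions, for illustration only
reference_result = 1.1565620   # reference mean for "microhh" on the reference machine
performance_factor = 1.2       # assumed: this system is 20% slower than the reference
mean_relative_std = 0.125      # the enforced noise floor from get_performance_factor
mean, std = 1.40, 0.05         # assumed local benchmark mean and standard deviation

# the assertion from check_benchmark_performance:
# 1.40 - 0.10 = 1.30 <= 1.1565620 * (1.2 + 0.25) ≈ 1.677, so this run passes
assert mean - std * 2 <= reference_result * (performance_factor + mean_relative_std * 2)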
