#include <benchmark/benchmark.h>
#include <array>
// Number of elements in each std::array filled by the benchmarks below.
constexpr int len = 6;
// constexpr functions are implicitly inline, so a definition like this one
// belongs in a header file.
// Despite its name, this returns the square of `i` (i * i), not a general power.
constexpr auto my_pow(const int i)
{
    const auto squared = i * i;
    return squared;
}
// Benchmark: write the squares of 1..6 into a std::array through operator[]
// (element access without bounds checking).
static void bench_array_operator(benchmark::State& state)
{
    // Value-initialised so the array never holds indeterminate ints.
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _ : state) {
        arr[0] = my_pow(i);
        arr[1] = my_pow(i + 1);
        arr[2] = my_pow(i + 2);
        arr[3] = my_pow(i + 3);
        arr[4] = my_pow(i + 4);
        arr[5] = my_pow(i + 5);
        // Nothing else reads `arr`, so without these two calls an optimising
        // build may drop every store above and time an empty loop.
        // DoNotOptimize makes the storage address escape; ClobberMemory
        // forces the pending writes to actually be performed.
        benchmark::DoNotOptimize(arr.data());
        benchmark::ClobberMemory();
    }
}
BENCHMARK(bench_array_operator);
// Benchmark: write the squares of 1..6 into a std::array through at()
// (bounds-checked element access).
static void bench_array_at(benchmark::State& state)
{
    // Value-initialised so the array never holds indeterminate ints.
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _ : state) {
        arr.at(0) = my_pow(i);
        arr.at(1) = my_pow(i + 1);
        arr.at(2) = my_pow(i + 2);
        arr.at(3) = my_pow(i + 3);
        arr.at(4) = my_pow(i + 4);
        arr.at(5) = my_pow(i + 5);
        // Nothing else reads `arr`, so without these two calls an optimising
        // build may drop every store above and time an empty loop.
        // DoNotOptimize makes the storage address escape; ClobberMemory
        // forces the pending writes to actually be performed.
        benchmark::DoNotOptimize(arr.data());
        benchmark::ClobberMemory();
    }
}
BENCHMARK(bench_array_at);
// Benchmark: write the squares of 1..6 into a std::array through std::get<N>().
// std::get<N>(array) is a constexpr function that returns a reference to the
// element and checks the index at compile time.
static void bench_array_get(benchmark::State& state)
{
    // Value-initialised so the array never holds indeterminate ints.
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _ : state) {
        std::get<0>(arr) = my_pow(i);
        std::get<1>(arr) = my_pow(i + 1);
        std::get<2>(arr) = my_pow(i + 2);
        std::get<3>(arr) = my_pow(i + 3);
        std::get<4>(arr) = my_pow(i + 4);
        std::get<5>(arr) = my_pow(i + 5);
        // Nothing else reads `arr`, so without these two calls an optimising
        // build may drop every store above and time an empty loop.
        // DoNotOptimize makes the storage address escape; ClobberMemory
        // forces the pending writes to actually be performed.
        benchmark::DoNotOptimize(arr.data());
        benchmark::ClobberMemory();
    }
}
BENCHMARK(bench_array_get);
BENCHMARK_MAIN();
编译运行:
[root benchmark]#g++ -Wall -std=c++14 benchmark.cc -pthread -lbenchmark
[root benchmark]#
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:34:08-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.01, 0.07, 0.08
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
bench_array_operator 148 ns 148 ns 4717911
bench_array_at 153 ns 153 ns 4567104
bench_array_get 140 ns 140 ns 4985060
这里有告警信息:
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
在stackoverflow中有方法可以关闭:
sudo cpupower frequency-set --governor performance
./a.out
sudo cpupower frequency-set --governor powersave
完整运行结果:
[root benchmark]#sudo cpupower frequency-set --governor performance
Setting cpu: 0
Setting cpu: 1
Setting cpu: 2
Setting cpu: 3
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:35:43-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.00, 0.05, 0.07
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
bench_array_operator 62.3 ns 62.3 ns 11242599
bench_array_at 64.3 ns 64.3 ns 10895305
bench_array_get 58.9 ns 58.9 ns 11885084
[root benchmark]#
[root benchmark]#sudo cpupower frequency-set --governor powersave
Setting cpu: 0
Setting cpu: 1
Setting cpu: 2
Setting cpu: 3
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:36:09-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.00, 0.04, 0.07
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
bench_array_operator 148 ns 148 ns 4714348
bench_array_at 153 ns 153 ns 4563909
bench_array_get 141 ns 141 ns 4980732
[root benchmark]#
可以看到开启性能模式后程序执行速度明显加快:省电模式下每次迭代约140ns,性能模式下缩短到约60ns。参考:google microbenchmarking cpu scaling warning
另一个例子:
#include <benchmark/benchmark.h>
// Benchmark: default-construct an empty std::string once per iteration; the
// string is destroyed again when the iteration's scope ends.
static void BM_StringCreation(benchmark::State& state) {
  for (auto _ : state) {
    std::string blank;
  }
}
// Register the function as a benchmark
BENCHMARK(BM_StringCreation);
// Define another benchmark
// Benchmark: copy-construct a std::string from a fixed "hello" source once
// per iteration. The source is built once, outside the timed loop.
static void BM_StringCopy(benchmark::State& state) {
  const std::string source{"hello"};
  for (auto _ : state) {
    std::string duplicate(source);
  }
}
BENCHMARK(BM_StringCopy);
BENCHMARK_MAIN();
编译运行:
[root benchmark]#g++ bmstring.cc -std=c++11 -isystem benchmark/include -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
[root benchmark]#
[root benchmark]#
[root benchmark]#./mybenchmark
2021-07-28T10:55:22-04:00
Running ./mybenchmark
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.03, 0.03, 0.00
------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------
BM_StringCreation 11.6 ns 11.6 ns 60144680
BM_StringCopy 34.4 ns 34.4 ns 20358772
最后我们验证vector的push_back和emplace_back的性能差异,参考C++ Diary #1 | emplace_back vs. push_back
#include <benchmark/benchmark.h>
#include <iostream>
// Small value type holding two ints, used as the element type of the
// push_back / emplace_back benchmarks.
//
// The special member functions are user-provided (not defaulted) so that the
// commented-out trace statements can be re-enabled to see which of them a
// container operation invokes.
class MyClass {
 public:
  // Constructs an object holding (x, y).
  MyClass(int x, int y) : x_(x), y_(y) {
    // std::cout << "Create class" << std::endl;
  }

  ~MyClass() {
    // std::cout << "Destroy class" << std::endl;
  }

  // Copy constructor: copies BOTH members. Copying only x_ would leave y_ at
  // its default of 0 in every copy.
  MyClass(const MyClass& my_class) : x_(my_class.x_), y_(my_class.y_) {
    // std::cout << "Copy Constructor Called" << std::endl;
  }

  // Move constructor: for plain ints a move is a copy, so std::move is not
  // needed. noexcept lets std::vector move elements when it reallocates.
  MyClass(MyClass&& my_class) noexcept
      : x_(my_class.x_), y_(my_class.y_) {
    // std::cout << "Move Constructor Called" << std::endl;
  }

  // Assignment operators, defaulted to complete the Rule of Five. Declaring
  // the move constructor would otherwise leave copy assignment deleted.
  MyClass& operator=(const MyClass&) = default;
  MyClass& operator=(MyClass&&) noexcept = default;

 private:
  int x_ = 0;
  int y_ = 0;
};
// Benchmark: append a temporary MyClass(1, 2) to a vector with push_back once
// per iteration. The vector is declared outside the timed loop, so it keeps
// growing for the whole run.
static void BM_vector_push_back(benchmark::State& state) {
  std::vector<MyClass> items;
  for (auto _ : state) {
    items.push_back(MyClass{1, 2});
  }
}
// Register the function as a benchmark
BENCHMARK(BM_vector_push_back);
// Define another benchmark
// Benchmark: construct a MyClass in place at the end of a vector with
// emplace_back(1, 2) once per iteration. The vector is declared outside the
// timed loop, so it keeps growing for the whole run.
static void BM_vector_emplace_back(benchmark::State& state) {
  std::vector<MyClass> items;
  for (auto _ : state) {
    items.emplace_back(1, 2);
  }
}
BENCHMARK(BM_vector_emplace_back);
BENCHMARK_MAIN();
编译运行:
[root benchmark]#g++ emplace_back.cc -std=c++11 -isystem benchmark/include -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
[root benchmark]#
[root benchmark]#./mybenchmark
2021-07-28T11:04:53-04:00
Running ./mybenchmark
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.03, 0.05, 0.00
-----------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------
BM_vector_push_back 126 ns 126 ns 6198943
BM_vector_emplace_back 105 ns 105 ns 7041128
再对比下vector中使用reserve提升性能
#include <benchmark/benchmark.h>
#include <vector>
// Number of push_back calls each benchmark iteration performs.
int loop = 1000;
// Benchmark: each iteration builds a fresh vector, reserves room for `loop`
// elements up front, then appends the values 0 .. loop-1.
static void bench_vector_reserve(benchmark::State& state)
{
    for (auto _ : state) {
        std::vector<int> values;
        values.reserve(loop);
        int next = 0;
        while (next < loop) {
            values.push_back(next);
            ++next;
        }
    }
}
BENCHMARK(bench_vector_reserve);
// Benchmark: each iteration builds a fresh vector and appends the values
// 0 .. loop-1 without reserving capacity first, so the vector has to grow
// as it fills.
static void bench_vector(benchmark::State& state)
{
    for (auto _ : state) {
        std::vector<int> values;
        int next = 0;
        while (next < loop) {
            values.push_back(next);
            ++next;
        }
    }
}
BENCHMARK(bench_vector);
BENCHMARK_MAIN();
运行结果:
[root benchmark]#./a.out
2021-08-15T16:26:45+08:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.24, 1.68, 1.63
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
bench_vector_reserve 23651 ns 23651 ns 29597
bench_vector 29317 ns 29317 ns 24126