google benchmark学习与使用

google benchmark

#include <benchmark/benchmark.h>
#include <array>

// Number of elements in every std::array used by the benchmarks in this file.
constexpr int len = 6;

// Returns the square of `i` (despite the name, this is not a general power).
// A constexpr function is implicitly inline, so its definition may live in a
// header file.
constexpr auto my_pow(const int i)
{
    const int squared = i * i;
    return squared;
}

// Benchmarks unchecked element access: each iteration stores the squares of
// 1..6 into a std::array through operator[].
static void bench_array_operator(benchmark::State& state)
{
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _: state) {
        arr[0] = my_pow(i);
        arr[1] = my_pow(i+1);
        arr[2] = my_pow(i+2);
        arr[3] = my_pow(i+3);
        arr[4] = my_pow(i+4);
        arr[5] = my_pow(i+5);
        // The array is never read back, so without this barrier an optimizing
        // build is free to delete the stores and time an empty loop.
        benchmark::DoNotOptimize(arr);
    }
}
BENCHMARK(bench_array_operator);

// Benchmarks bounds-checked element access: each iteration stores the squares
// of 1..6 into a std::array through at().
static void bench_array_at(benchmark::State& state)
{
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _: state) {
        arr.at(0) = my_pow(i);
        arr.at(1) = my_pow(i+1);
        arr.at(2) = my_pow(i+2);
        arr.at(3) = my_pow(i+3);
        arr.at(4) = my_pow(i+4);
        arr.at(5) = my_pow(i+5);
        // The array is never read back, so without this barrier an optimizing
        // build is free to delete the stores and time an empty loop.
        benchmark::DoNotOptimize(arr);
    }
}
BENCHMARK(bench_array_at);

// Benchmarks element access through std::get<I>(array), a constexpr function
// that returns a reference to the element and has its index checked at
// compile time. Each iteration stores the squares of 1..6.
static void bench_array_get(benchmark::State& state)
{
    std::array<int, len> arr{};
    constexpr int i = 1;
    for (auto _: state) {
        std::get<0>(arr) = my_pow(i);
        std::get<1>(arr) = my_pow(i+1);
        std::get<2>(arr) = my_pow(i+2);
        std::get<3>(arr) = my_pow(i+3);
        std::get<4>(arr) = my_pow(i+4);
        std::get<5>(arr) = my_pow(i+5);
        // The array is never read back, so without this barrier an optimizing
        // build is free to delete the stores and time an empty loop.
        benchmark::DoNotOptimize(arr);
    }
}
BENCHMARK(bench_array_get);

// Supplies the program's main(), which runs the benchmarks registered with BENCHMARK.
BENCHMARK_MAIN();

编译运行：

[root benchmark]#g++ -Wall -std=c++14 benchmark.cc -pthread -lbenchmark
[root benchmark]#
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:34:08-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.01, 0.07, 0.08
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
bench_array_operator        148 ns          148 ns      4717911
bench_array_at              153 ns          153 ns      4567104
bench_array_get             140 ns          140 ns      4985060

这里有告警信息：

***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.

在stackoverflow中有方法可以关闭：

sudo cpupower frequency-set --governor performance
./a.out
sudo cpupower frequency-set --governor powersave

完整运行结果：

[root benchmark]#sudo cpupower frequency-set --governor performance
Setting cpu: 0
Setting cpu: 1
Setting cpu: 2
Setting cpu: 3
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:35:43-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.00, 0.05, 0.07
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
bench_array_operator       62.3 ns         62.3 ns     11242599
bench_array_at             64.3 ns         64.3 ns     10895305
bench_array_get            58.9 ns         58.9 ns     11885084
[root benchmark]#
[root benchmark]#sudo cpupower frequency-set --governor powersave
Setting cpu: 0
Setting cpu: 1
Setting cpu: 2
Setting cpu: 3
[root benchmark]#
[root benchmark]#./a.out
2021-07-27T12:36:09-04:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.00, 0.04, 0.07
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
bench_array_operator        148 ns          148 ns      4714348
bench_array_at              153 ns          153 ns      4563909
bench_array_get             141 ns          141 ns      4980732
[root benchmark]#

可以看到开启性能模式后程序执行速度明显加快：省电模式下每次迭代约140ns，性能模式下缩短到约60ns。参考：google microbenchmarking cpu scaling warning

另一个例子：

#include <benchmark/benchmark.h>

// Times default-constructing (and then destroying) an empty std::string.
static void BM_StringCreation(benchmark::State& state) {
  for (auto _ : state) {
    std::string blank;
  }
}
// Hand the function to the benchmark library so that it gets run.
BENCHMARK(BM_StringCreation);

// Second benchmark: times copy-constructing a std::string from a short source
// string that is built once, outside the timed loop.
static void BM_StringCopy(benchmark::State& state) {
  const std::string source("hello");
  for (auto _ : state) {
    std::string duplicate = source;
  }
}
BENCHMARK(BM_StringCopy);

// Supplies the program's main(), which runs the benchmarks registered with BENCHMARK.
BENCHMARK_MAIN();

编译运行：

[root benchmark]#g++ bmstring.cc -std=c++11 -isystem benchmark/include  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
[root benchmark]#
[root benchmark]#
[root benchmark]#./mybenchmark
2021-07-28T10:55:22-04:00
Running ./mybenchmark
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.03, 0.03, 0.00
------------------------------------------------------------
Benchmark                  Time             CPU   Iterations
------------------------------------------------------------
BM_StringCreation       11.6 ns         11.6 ns     60144680
BM_StringCopy           34.4 ns         34.4 ns     20358772

最后我们验证vector的push_back和emplace_back的性能差异，参考C++ Diary #1 | emplace_back vs. push_back

#include <benchmark/benchmark.h>
#include <iostream>

// Small value type used to compare vector::push_back with
// vector::emplace_back. It spells out its destructor and copy/move
// constructors so each one can be traced: uncomment the std::cout lines to
// see which special member runs.
class MyClass {
public:
    // Builds an object holding the pair (x, y).
    MyClass(int x, int y) : x_(x), y_(y) {
        //std::cout << "Create class" << std::endl;
    }

    ~MyClass() {
        //std::cout << "Destroy class" << std::endl;
    }

    // Copy constructor: duplicates both members, so the copy compares equal
    // to its source.
    MyClass(const MyClass& my_class) : x_(my_class.x_), y_(my_class.y_) {
        //std::cout << "Copy Constructor Called" << std::endl;
    }

    // Move constructor: for plain ints a move is just a copy, so both members
    // are copied over. It is declared noexcept so std::vector can move
    // elements instead of copying them when it reallocates.
    MyClass(MyClass&& my_class) noexcept : x_(my_class.x_), y_(my_class.y_) {
        //std::cout << "Move Constructor Called" << std::endl;
    }

private:
    int x_ = 0;
    int y_ = 0;

};


// Times push_back on a vector that keeps growing across iterations: each
// iteration constructs a temporary MyClass and hands it to push_back.
static void BM_vector_push_back(benchmark::State& state) {
  std::vector<MyClass> items;
  for (auto _ : state) {
    items.push_back(MyClass{1, 2});
  }
}
// Hand the function to the benchmark library so that it gets run.
BENCHMARK(BM_vector_push_back);

// Second benchmark: times emplace_back on a vector that keeps growing across
// iterations; the constructor arguments are forwarded so the element is built
// in place.
static void BM_vector_emplace_back(benchmark::State& state) {
  std::vector<MyClass> items;
  for (auto _ : state) {
    items.emplace_back(1, 2);
  }
}
BENCHMARK(BM_vector_emplace_back);

// Supplies the program's main(), which runs the benchmarks registered with BENCHMARK.
BENCHMARK_MAIN();

编译运行：

[root benchmark]#g++ emplace_back.cc -std=c++11 -isystem benchmark/include  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
[root benchmark]#
[root benchmark]#./mybenchmark
2021-07-28T11:04:53-04:00
Running ./mybenchmark
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.03, 0.05, 0.00
-----------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-----------------------------------------------------------------
BM_vector_push_back           126 ns          126 ns      6198943
BM_vector_emplace_back        105 ns          105 ns      7041128

再对比下vector中使用reserve提升性能

#include <benchmark/benchmark.h>
#include <vector>

// Number of elements pushed into the vector on each benchmark iteration.
int loop = 1000;
// Times filling a fresh std::vector<int> with the values 0..loop-1 when the
// capacity for all `loop` elements is reserved before the first push_back.
static void bench_vector_reserve(benchmark::State& state)
{
    for (auto _ : state)
    {
        std::vector<int> values;
        values.reserve(loop);
        int next = 0;
        while (next < loop)
        {
            values.push_back(next);
            ++next;
        }
    }
}
BENCHMARK(bench_vector_reserve);

// Times filling a fresh std::vector<int> with the values 0..loop-1 without
// reserving capacity, so the vector has to grow as elements are appended.
static void bench_vector(benchmark::State& state)
{
    for (auto _ : state)
    {
        std::vector<int> values;
        int next = 0;
        while (next < loop)
        {
            values.push_back(next);
            ++next;
        }
    }
}
BENCHMARK(bench_vector);

// Supplies the program's main(), which runs the benchmarks registered with BENCHMARK.
BENCHMARK_MAIN();

运行结果：

[root benchmark]#./a.out
2021-08-15T16:26:45+08:00
Running ./a.out
Run on (4 X 1900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 3072 KiB (x1)
Load Average: 0.24, 1.68, 1.63
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
bench_vector_reserve      23651 ns        23651 ns        29597
bench_vector              29317 ns        29317 ns        24126

52coder

google benchmark学习与使用