UnitTests/Benchmarks/BenchmarkTools.cpp

0001 // This file is part of the Acts project.
0002 //
0003 // Copyright (C) 2020 CERN for the benefit of the Acts project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
0008
0009 #include <boost/test/data/test_case.hpp>
0010 #include <boost/test/unit_test.hpp>
0011
0012 #include "Acts/Tests/CommonHelpers/BenchmarkTools.hpp"
0013
0014 #include "Acts/Tests/CommonHelpers/FloatComparisons.hpp"
0015
0016 #include <cmath>
0017 #include <complex>
0018 #include <iostream>
0019 #include <sstream>
0020 #include <tuple>
0021
0022 namespace Acts::Test {
0023
0024 // Basic non-timing tests do not validate the core performance aspects of the
0025 // benchmark tools, but have the advantage of being runnable on any system.
0026 BOOST_AUTO_TEST_SUITE(benchmark_tools)
0027
0028 BOOST_AUTO_TEST_CASE(assume_accessed) {
0029   int x = 42;
0030   assumeAccessed(x);
0031   BOOST_CHECK_EQUAL(x, 42);
0032 }
0033
0034 BOOST_AUTO_TEST_CASE(assume_read) {
0035   float x = 4.2f;
0036   assumeRead(x);
0037   BOOST_CHECK_EQUAL(x, 4.2f);
0038
0039   const std::string y = "LOL";
0040   assumeRead(x);
0041   BOOST_CHECK_EQUAL(y, "LOL");
0042
0043   assumeRead(std::make_tuple(1, false, 3.5));
0044 }
0045
0046 BOOST_AUTO_TEST_CASE(assume_written) {
0047   std::complex c(1.2, 3.4);
0048   assumeWritten(c);
0049   BOOST_CHECK_EQUAL(c, std::complex(1.2, 3.4));
0050 }
0051
0052 BOOST_AUTO_TEST_CASE(micro_benchmark_result) {
0053   MicroBenchmarkResult res;
0054   res.iters_per_run = 42;
0055   res.run_timings = {
0056       std::chrono::microseconds(420), std::chrono::microseconds(21),
0057       std::chrono::milliseconds(4),   std::chrono::microseconds(84),
0058       std::chrono::microseconds(294), std::chrono::microseconds(378),
0059       std::chrono::microseconds(126), std::chrono::milliseconds(42)};
0060
0061   CHECK_CLOSE_REL(res.totalTime().count() / 1'000'000., 47.323, 1e-6);
0062
0063   const auto sorted = res.sortedRunTimes();
0064   BOOST_CHECK_EQUAL(sorted.size(), res.run_timings.size());
0065   BOOST_CHECK_EQUAL(sorted[0].count(), 21'000.);
0066   BOOST_CHECK_EQUAL(sorted[1].count(), 84'000.);
0067   BOOST_CHECK_EQUAL(sorted[2].count(), 126'000.);
0068   BOOST_CHECK_EQUAL(sorted[3].count(), 294'000.);
0069   BOOST_CHECK_EQUAL(sorted[4].count(), 378'000.);
0070   BOOST_CHECK_EQUAL(sorted[5].count(), 420'000.);
0071   BOOST_CHECK_EQUAL(sorted[6].count(), 4'000'000.);
0072   BOOST_CHECK_EQUAL(sorted[7].count(), 42'000'000.);
0073
0074   CHECK_CLOSE_REL(res.runTimeMedian().count() / 1000., (294. + 378.) / 2.,
0075                   1e-6);
0076
0077   const auto [firstq, thirdq] = res.runTimeQuartiles();
0078   CHECK_CLOSE_REL(firstq.count() / 1000., (84. + 126.) / 2., 1e-6);
0079   CHECK_CLOSE_REL(thirdq.count() / 1000., (420. + 4000.) / 2., 1e-6);
0080
0081   const auto robustRTStddev = res.runTimeRobustStddev();
0082   CHECK_CLOSE_REL(robustRTStddev.count(), (thirdq - firstq).count() / 1.349,
0083                   1e-3);
0084
0085   const auto runTimeError = res.runTimeError();
0086   CHECK_CLOSE_REL(
0087       runTimeError.count(),
0088       1.2533 * robustRTStddev.count() / std::sqrt(res.run_timings.size()),
0089       1e-3);
0090
0091   CHECK_CLOSE_REL(res.iterTimeAverage().count(),
0092                   res.runTimeMedian().count() / res.iters_per_run, 1e-6);
0093
0094   CHECK_CLOSE_REL(res.iterTimeError().count(),
0095                   runTimeError.count() / std::sqrt(res.iters_per_run), 1e-6);
0096
0097   std::ostringstream os;
0098   os << res;
0099   BOOST_CHECK_EQUAL(os.str(),
0100                     "8 runs of 42 iteration(s), 47.3ms total, "
0101                     "336.0000+/-1355.2296µs per run, "
0102                     "8000.000+/-209116.462ns per iteration");
0103 }
0104
0105 BOOST_AUTO_TEST_CASE(micro_benchmark) {
0106   int counter = 0;
0107   microBenchmark([&] { ++counter; }, 15, 7, std::chrono::milliseconds(0));
0108   BOOST_CHECK_EQUAL(counter, 15 * 7);
0109
0110   counter = 0;
0111   microBenchmark(
0112       [&] {
0113         ++counter;
0114         return counter;
0115       },
0116       17, 11, std::chrono::milliseconds(0));
0117   BOOST_CHECK_EQUAL(counter, 17 * 11);
0118
0119   counter = 0;
0120   int previous = 64;
0121   std::vector<int> ints{1, 2, 4, 8, 16, 32, 64};
0122   microBenchmark(
0123       [&](int input) {
0124         if (input == 1) {
0125           BOOST_CHECK_EQUAL(previous, 64);
0126           counter = 1;
0127         } else {
0128           BOOST_CHECK_EQUAL(input, previous * 2);
0129           counter += input;
0130         }
0131         previous = input;
0132       },
0133       ints, 123, std::chrono::milliseconds(3));
0134   BOOST_CHECK_EQUAL(counter, 127);
0135
0136   counter = 0;
0137   previous = -81;
0138   std::vector<char> chars{-1, 3, -9, 27, -81};
0139   microBenchmark(
0140       [&](int input) {
0141         if (input == -1) {
0142           BOOST_CHECK_EQUAL(previous, -81);
0143           counter = -1;
0144         } else {
0145           BOOST_CHECK_EQUAL(input, -previous * 3);
0146           counter += input;
0147         }
0148         previous = input;
0149         return &previous;
0150       },
0151       chars, 456, std::chrono::milliseconds(8));
0152   BOOST_CHECK_EQUAL(counter, -61);
0153 }
0154
0155 BOOST_AUTO_TEST_SUITE_END()
0156
0157 // Timing tests are perhaps the most important ones for validation of
0158 // benchmarking tools, but they cannot be run by default for two reasons:
0159 // - They take a while to run, and therefore slow down the testing cycle
0160 // - They require a quiet system to succeed, and will likely fail when invoked
0161 //   by a parallel run of CTest or when run on a continuous integration VM.
0162 //
0163 // If you can ensure both of these preconditions, you can run the test with
0164 // ./BenchmarkTools --run_test=benchmark_timings
0165 BOOST_AUTO_TEST_SUITE(benchmark_timings, *boost::unit_test::disabled())
0166
// Base iteration count, scaled by the timing tests of this suite before being
// passed to microBenchmark.
constexpr std::size_t bench_iters{1'000};
0168
0169 BOOST_AUTO_TEST_CASE(micro_benchmark) {
0170   using namespace std::literals::chrono_literals;
0171
0172   // For simple microbenchmarking needs, plain use of microBenchmark is enough.
0173   //
0174   // For example, here, the microbenchmark loop isn't optimized out even though
0175   // each iteration does literally nothing. If it were optimized out, the time
0176   // per iteration would change, since we wouldn't get linear scaling anymore.
0177   auto nop = [] {};
0178   const auto nop_x10 = microBenchmark(nop, 10 * bench_iters);
0179   std::cout << "nop (10x iters): " << nop_x10 << std::endl;
0180   const auto nop_x100 = microBenchmark(nop, 100 * bench_iters);
0181   std::cout << "nop (100x iters): " << nop_x100 << std::endl;
0182   const double nop_x10_iter_ns = nop_x10.iterTimeAverage().count();
0183   const double nop_x100_iter_ns = nop_x100.iterTimeAverage().count();
0184   CHECK_CLOSE_REL(nop_x10_iter_ns, nop_x100_iter_ns, 0.1);
0185
0186 // These tests reason about the performance characteristics of _optimized_ code,
0187 // and should therefore be compiled out of debug/coverage builds.
0188 #ifdef __OPTIMIZE__
0189   // The microbenchmarking harness is super low overhead, less than 1
0190   // nanosecond per iteration on a modern CPU.
0191   BOOST_CHECK_LT(nop_x100_iter_ns, 1.0);
0192
0193   // With a well-chosen iteration count that keeps per-run times under the OS
0194   // scheduling quantum (typically 1ms), the noise is also super low.
0195   BOOST_CHECK_LT(nop_x100.iterTimeError().count(), 0.1);
0196
0197   // You can measure the overhead of any operation as long as it's not
0198   // _obnoxiously_ amenable to compiler const-propagation or dead code
0199   // elimination. For example, this sqrt throughput microbenchmark works,
0200   // because microBenchmark forces the compiler to assume that "x", "y" and "z"
0201   // are modified on every benchmark iteration...
0202   const double x = 1.2, y = 3.4, z = 5.6;
0203   auto sqrt = microBenchmark(
0204       [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
0205       bench_iters);
0206   std::cout << "sqrt (correct): " << sqrt << std::endl;
0207   BOOST_CHECK_GT(sqrt.iterTimeAverage().count(), 10. * nop_x100_iter_ns);
0208
0209   // ...but this variant doesn't work, because the compiler can trivially
0210   // precompute the square root when optimizing the inner lambda...
0211   const auto sqrt_constprop = microBenchmark(
0212       [] {
0213         return std::sqrt(1.2 * 3.4) + std::sqrt(3.4 * 5.6) +
0214                std::sqrt(5.6 * 1.2);
0215       },
0216       bench_iters * 20);
0217   std::cout << "sqrt (constprop'd): " << sqrt_constprop << std::endl;
0218   BOOST_CHECK_LT(sqrt_constprop.iterTimeAverage().count(),
0219                  sqrt.iterTimeAverage().count() / 5.);
0220
0221   // ...and this one doesn't work either, because the compiler can trivially
0222   // infer that the result of the computation is unused and stop computing it.
0223   //
0224   // The lower tolerance of this test is needed because current GCC doesn't
0225   // optimize _everything_ out in its default configuration, as sqrt could still
0226   // have side-effects like setting the errno thread-local variable...
0227   const auto sqrt_deadcode = microBenchmark(
0228       [&] { (void)(std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x)); },
0229       bench_iters * 10);
0230   std::cout << "sqrt (deadcode'd): " << sqrt_deadcode << std::endl;
0231   BOOST_CHECK_LT(sqrt_deadcode.iterTimeAverage().count(),
0232                  sqrt.iterTimeAverage().count() / 3.);
0233 #endif
0234 }
0235
0236 // These tests reason about the performance characteristics of _optimized_ code,
0237 // and should therefore be compiled out of debug/coverage builds.
0238 #ifdef __OPTIMIZE__
0239 BOOST_AUTO_TEST_CASE(assume_read) {
0240   // You can use assumeRead when you want the compiler to assume that the result
0241   // of some computation has been read and therefore the computation shouldn't
0242   // be optimized out. This is what microBenchmark implicitly does to the value
0243   // returned by the benchmark iteration function, if any.
0244   //
0245   // For example, these two computations are almost equivalent. Notice that
0246   // assumeRead can be used on temporaries.
0247   const double x = 1.2, y = 3.4, z = 5.6;
0248   const auto tuple_return = microBenchmark(
0249       [&] {
0250         return std::make_tuple(
0251             std::sqrt(x * y), std::complex(std::sqrt(y * z), std::sqrt(z * x)));
0252       },
0253       bench_iters);
0254   std::cout << "tuple return: " << tuple_return << std::endl;
0255   const auto assumeread = microBenchmark(
0256       [&] {
0257         assumeRead(std::sqrt(x * y));
0258         assumeRead(std::complex(std::sqrt(y * z), std::sqrt(z * x)));
0259       },
0260       bench_iters);
0261   std::cout << "assumeRead: " << assumeread << std::endl;
0262   const double tuple_return_iter_ns = tuple_return.iterTimeAverage().count();
0263   const double assumeRead_iter_ns = assumeread.iterTimeAverage().count();
0264   CHECK_CLOSE_REL(tuple_return_iter_ns, assumeRead_iter_ns, 1e-2);
0265 }
0266 #endif
0267
0268 BOOST_AUTO_TEST_CASE(assume_written) {
0269   // You can use assumeWritten when you want the compiler to assume that some
0270   // variables have been written to, and every dependent computation must
0271   // therefore be recomputed. This is what microBenchmark implicitly does to
0272   // every variable captured by the benchmark iteration lambda.
0273   //
0274   // Since assumeWritten operates on variables in memory, it cannot be used on
0275   // temporaries, but only on mutable variables.
0276   double x = 1.2, y = 3.4, z = 5.6;
0277   auto sqrt_sum = microBenchmark(
0278       [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
0279       bench_iters);
0280   std::cout << "sqrt sum: " << sqrt_sum << std::endl;
0281   auto sqrt_2sums = microBenchmark(
0282       [&] {
0283         double tmp = std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
0284         assumeWritten(x);
0285         assumeWritten(y);
0286         assumeWritten(z);
0287         return tmp + std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
0288       },
0289       bench_iters);
0290   std::cout << "2x(sqrt sum): " << sqrt_2sums << std::endl;
0291   const double sqrt_sum_iter_ns = sqrt_sum.iterTimeAverage().count();
0292   const double sqrt_2sums_iter_ns = sqrt_2sums.iterTimeAverage().count();
0293   CHECK_CLOSE_REL(2. * sqrt_sum_iter_ns, sqrt_2sums_iter_ns, 1e-2);
0294 }
0295
0296 BOOST_AUTO_TEST_SUITE_END()
0297
0298 }  // namespace Acts::Test