diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h index d8b4fd4c6..2c06075e0 100644 --- a/bench/tensors/benchmark.h +++ b/bench/tensors/benchmark.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -45,4 +46,5 @@ void StopBenchmarkTiming(); void StartBenchmarkTiming(); #define BENCHMARK(f) \ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ - (new ::testing::Benchmark(#f, f)) \ No newline at end of file + (new ::testing::Benchmark(#f, f)) + diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc index 0fc12960e..b2f457c96 100644 --- a/bench/tensors/benchmark_main.cc +++ b/bench/tensors/benchmark_main.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,8 +28,14 @@ static int64_t g_benchmark_total_time_ns; static int64_t g_benchmark_start_time_ns; typedef std::map BenchmarkMap; typedef BenchmarkMap::iterator BenchmarkMapIt; -static BenchmarkMap g_benchmarks; + +BenchmarkMap& gBenchmarks() { + static BenchmarkMap g_benchmarks; + return g_benchmarks; +} + static int g_name_column_width = 20; + static int Round(int n) { int base = 1; while (base*10 < n) { @@ -101,7 +108,7 @@ void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int fprintf(stderr, "%s: missing function\n", name_); exit(EXIT_FAILURE); } - g_benchmarks.insert(std::make_pair(name, this)); + gBenchmarks().insert(std::make_pair(name, this)); } void Benchmark::Run() { if (fn_ != NULL) { @@ -183,16 +190,16 @@ void StartBenchmarkTiming() { } } int main(int argc, char* argv[]) { - if (g_benchmarks.empty()) { + if (gBenchmarks().empty()) { fprintf(stderr, "No benchmarks registered!\n"); exit(EXIT_FAILURE); } - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { int name_width = static_cast(strlen(it->second->Name())); g_name_column_width = std::max(g_name_column_width, name_width); } bool need_header = true; - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { ::testing::Benchmark* b = it->second; if (b->ShouldRun(argc, argv)) { if (need_header) { @@ -206,10 +213,10 @@ int main(int argc, char* argv[]) { if (need_header) { fprintf(stderr, "No matching benchmarks!\n"); fprintf(stderr, "Available benchmarks:\n"); - for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { fprintf(stderr, " %s\n", it->second->Name()); } exit(EXIT_FAILURE); } return 0; -} \ No newline at end of file +} diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index a1696afda..071326aa7 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -10,13 +10,6 @@ typedef int TensorIndex; #define BENCHMARK_RANGE(bench, lo, hi) \ BENCHMARK(bench)->Range(lo, hi) -template -std::string StrCat(const Args... args) { - std::stringstream ss; - StrCatRecursive(ss, args...); - return ss.str(); -} - using Eigen::Tensor; using Eigen::TensorMap; @@ -305,9 +298,9 @@ template class BenchmarkSuite { } - size_t m_; - size_t k_; - size_t n_; + TensorIndex m_; + TensorIndex k_; + TensorIndex n_; float* a_; float* b_; float* c_; diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu similarity index 81% rename from bench/tensors/tensor_benchmarks_gpu.cc rename to bench/tensors/tensor_benchmarks_gpu.cu index 9fe8f84d9..fbb486efd 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -10,13 +10,11 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); @@ -35,13 +33,11 @@ BM_FuncGPU(reduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); @@ -55,13 +51,11 @@ BM_FuncWithInputDimsGPU(contraction, N, 64, N); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters, DIM1, DIM2); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);