Merged in jiayq/eigen (pull request PR-159)

Modifications to the tensor benchmarks to allow compilation in a standalone fashion.
This commit is contained in:
Benoit Steiner 2016-01-28 11:28:55 -08:00
commit 12f8bd12a2
5 changed files with 326 additions and 68 deletions

50
bench/tensors/benchmark.h Normal file
View File

@ -0,0 +1,50 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stddef.h>
#include <stdint.h>
#include <vector>
namespace testing {
// A single registered micro-benchmark.  Wraps either a plain benchmark
// function fn(iterations) or a "ranged" function fn_range(iterations, arg)
// that is run once per argument registered via Arg()/Range().
// Construction registers the benchmark in a global registry (see Register),
// which is what the BENCHMARK() macro below relies on.
class Benchmark {
public:
// Registers a plain benchmark function under `name`.
Benchmark(const char* name, void (*fn)(int)) {
Register(name, fn, NULL);
}
// Registers a ranged benchmark function under `name`.
Benchmark(const char* name, void (*fn_range)(int, int)) {
Register(name, NULL, fn_range);
}
// Adds one argument value for a ranged benchmark; returns this for chaining.
Benchmark* Arg(int x);
// Adds a geometric series of argument values from lo to hi (hi always
// included); returns this for chaining.
Benchmark* Range(int lo, int hi);
// Name passed at registration time.
const char* Name();
// True when no filters were given, or any argv regex matches this name.
bool ShouldRun(int argc, char* argv[]);
// Runs the benchmark and prints one result line per argument.
void Run();
private:
const char* name_;
void (*fn_)(int); // plain form; NULL when the ranged form is used
void (*fn_range_)(int, int); // ranged form; NULL when the plain form is used
std::vector<int> args_; // arguments for the ranged form
// Stores the callbacks and inserts this benchmark into the global registry.
void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
// Runs the callback `iterations` times with `arg`, tracking time and bytes.
void RunRepeatedlyWithArg(int iterations, int arg);
// Picks an iteration count for `arg`, runs, and prints the result line.
void RunWithArg(int arg);
};
} // namespace testing
// Records how many bytes the current benchmark run processed; used to print
// the throughput column.
void SetBenchmarkBytesProcessed(int64_t);
// Stops the benchmark timer (e.g. to exclude setup work from the timing).
void StopBenchmarkTiming();
// Starts (or restarts) the benchmark timer.
void StartBenchmarkTiming();
// Registers function `f` as a benchmark via a file-scope object whose
// constructor runs at static-initialization time.
#define BENCHMARK(f) \
static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
(new ::testing::Benchmark(#f, f))

View File

@ -0,0 +1,222 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark.h"
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <inttypes.h>
#include <time.h>
#include <map>
// Bytes processed by the current benchmark run (set via
// SetBenchmarkBytesProcessed); feeds the throughput column.
static int64_t g_bytes_processed;
// Accumulated measured time for the current run, in nanoseconds.
static int64_t g_benchmark_total_time_ns;
// Start timestamp of the currently running timing interval; 0 means the
// timer is stopped.
static int64_t g_benchmark_start_time_ns;
typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
typedef BenchmarkMap::iterator BenchmarkMapIt;
// Global registry of benchmarks keyed by name.  Wrapped in a function so the
// map is constructed on first use, which keeps it safe to touch from the
// static initializers created by the BENCHMARK() macro.
BenchmarkMap& gBenchmarks() {
  static BenchmarkMap registry;
  return registry;
}
// Width of the printed name column; grown in main() to fit the longest name.
static int g_name_column_width = 20;
// Rounds n up to the next "nice" iteration count from the 1-2-5 decade
// series: 2, 5, 10, 20, 50, 100, ...  Values already on the series map to
// themselves (e.g. 100 -> 100); anything above maps upward (e.g. 101 -> 200).
static int Round(int n) {
  int magnitude = 1;
  for (; magnitude * 10 < n; magnitude *= 10) {
  }
  if (n < 2 * magnitude) {
    return 2 * magnitude;
  }
  return (n < 5 * magnitude) ? 5 * magnitude : 10 * magnitude;
}
// Returns the current CLOCK_MONOTONIC time in nanoseconds since some
// arbitrary epoch; suitable only for measuring intervals.
static int64_t NanoTime() {
  const int64_t kNanosPerSecond = 1000000000LL;
  struct timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = 0;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return static_cast<int64_t>(ts.tv_sec) * kNanosPerSecond + ts.tv_nsec;
}
namespace testing {
// Appends one argument value for the ranged benchmark form.
// Returns this so registrations can be chained.
Benchmark* Benchmark::Arg(int arg) {
args_.push_back(arg);
return this;
}
// Registers a geometric series of argument values between lo and hi:
// lo, 8*lo, 64*lo, ..., plus hi itself (hi is always run).
// The bounds are swapped if given in reverse order.  Returns this for
// chaining.
Benchmark* Benchmark::Range(int lo, int hi) {
  const int kRangeMultiplier = 8;
  if (hi < lo) {
    // Normalize so that lo <= hi.
    int temp = hi;
    hi = lo;
    lo = temp;
  }
  // A non-positive lo can never grow by multiplication, which previously made
  // the loop below spin forever.  Record it once and restart the series at 1.
  if (lo <= 0 && lo < hi) {
    args_.push_back(lo);
    lo = 1;
  }
  while (lo < hi) {
    args_.push_back(lo);
    // Stop before the multiplication overflows int; the next value would
    // exceed hi anyway, so the result list is unchanged for in-range inputs.
    if (lo > hi / kRangeMultiplier) {
      break;
    }
    lo *= kRangeMultiplier;
  }
  // We always run the hi number.
  args_.push_back(hi);
  return this;
}
// Returns the name this benchmark was registered under.
const char* Benchmark::Name() {
return name_;
}
// Decides whether this benchmark should run for the given command line.
// With no arguments every benchmark runs; otherwise each argument is treated
// as a POSIX regular expression and the benchmark runs if any one matches
// its name.  Exits the process on an invalid regex.
bool Benchmark::ShouldRun(int argc, char* argv[]) {
  if (argc == 1) {
    return true;  // No filters: run everything.
  }
  for (int i = 1; i < argc; i++) {
    regex_t re;
    if (regcomp(&re, argv[i], 0) != 0) {
      fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
      exit(EXIT_FAILURE);
    }
    const int rc = regexec(&re, name_, 0, NULL, 0);
    regfree(&re);
    if (rc != REG_NOMATCH) {
      return true;
    }
  }
  return false;
}
// Stores the benchmark's name and callbacks and adds it to the global
// registry.  Exactly one of fn / fn_range is expected to be non-NULL;
// having neither is a programming error and aborts the process.
void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
  name_ = name;
  fn_ = fn;
  fn_range_ = fn_range;
  const bool has_callback = (fn_ != NULL) || (fn_range_ != NULL);
  if (!has_callback) {
    fprintf(stderr, "%s: missing function\n", name_);
    exit(EXIT_FAILURE);
  }
  gBenchmarks().insert(std::make_pair(name, this));
}
// Executes the benchmark: the plain form runs once with a dummy arg of 0,
// the ranged form runs once per registered argument.  A ranged benchmark
// with no arguments is a registration error and aborts the process.
void Benchmark::Run() {
  if (fn_ != NULL) {
    RunWithArg(0);
    return;
  }
  if (args_.empty()) {
    fprintf(stderr, "%s: no args!\n", name_);
    exit(EXIT_FAILURE);
  }
  for (std::vector<int>::const_iterator it = args_.begin(); it != args_.end(); ++it) {
    RunWithArg(*it);
  }
}
// Runs the benchmark body `iterations` times with `arg`.
// Resets the global byte/time counters, starts the timer, invokes whichever
// callback form is registered, and finally folds in any interval the
// benchmark left running (i.e. it never called StopBenchmarkTiming()).
void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
g_bytes_processed = 0;
g_benchmark_total_time_ns = 0;
g_benchmark_start_time_ns = NanoTime();
if (fn_ != NULL) {
fn_(iterations);
} else {
fn_range_(iterations, arg);
}
// Nonzero start time means the clock is still running: accumulate the tail.
if (g_benchmark_start_time_ns != 0) {
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
}
}
// Runs the benchmark for one argument value: calibrates an iteration count
// targeting roughly one second of measured time, then prints a result line
// with the name, iteration count, ns/op, and (if bytes were reported)
// throughput in MiB/s.
void Benchmark::RunWithArg(int arg) {
  // Run once first in case a single iteration is already expensive.
  int iterations = 1;
  RunRepeatedlyWithArg(iterations, arg);
  while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
    int last = iterations;
    if (g_benchmark_total_time_ns/iterations == 0) {
      iterations = 1e9;
    } else {
      // Estimate the count that would take ~1s at the observed ns/op.
      iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
    }
    // Grow by at least 1 and at most 100x, then round to a "nice" number.
    iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
    iterations = Round(iterations);
    RunRepeatedlyWithArg(iterations, arg);
  }
  char throughput[100];
  throughput[0] = '\0';
  if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) {
    // Divide by 2^20 so the number matches the printed "MiB/s" unit
    // (the previous /1e6 produced MB, not MiB).
    double mib_processed = static_cast<double>(g_bytes_processed)/(1024.0*1024.0);
    double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
    snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds);
  }
  char full_name[100];
  if (fn_range_ != NULL) {
    // Ranged benchmarks append the argument, abbreviated with binary K/M.
    if (arg >= (1<<20)) {
      snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
    } else if (arg >= (1<<10)) {
      snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
    } else {
      snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
    }
  } else {
    snprintf(full_name, sizeof(full_name), "%s", name_);
  }
  printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
         iterations, g_benchmark_total_time_ns/iterations, throughput);
  fflush(stdout);
}
} // namespace testing
// Records the total bytes processed by the current run; enables the
// throughput column in the printed results.
void SetBenchmarkBytesProcessed(int64_t x) {
g_bytes_processed = x;
}
// Stops the benchmark clock, folding the elapsed interval into the total.
// Safe to call when the clock is already stopped (start time of 0).
void StopBenchmarkTiming() {
  if (g_benchmark_start_time_ns == 0) {
    return;  // Clock already stopped; nothing to accumulate.
  }
  g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
  g_benchmark_start_time_ns = 0;
}
// Starts the benchmark clock if it is not already running.  Calling this
// while the clock runs is a no-op, so repeated calls do not reset timing.
void StartBenchmarkTiming() {
  if (g_benchmark_start_time_ns != 0) {
    return;  // Already running.
  }
  g_benchmark_start_time_ns = NanoTime();
}
// Entry point: sizes the name column to the longest registered benchmark,
// runs every benchmark whose name matches the command-line regex filters
// (all of them when no filters are given), and exits with failure when
// nothing was registered or nothing matched.
int main(int argc, char* argv[]) {
  if (gBenchmarks().empty()) {
    fprintf(stderr, "No benchmarks registered!\n");
    exit(EXIT_FAILURE);
  }
  // Widen the printed name column to fit the longest benchmark name.
  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
    const int name_width = static_cast<int>(strlen(it->second->Name()));
    if (name_width > g_name_column_width) {
      g_name_column_width = name_width;
    }
  }
  bool need_header = true;
  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
    ::testing::Benchmark* b = it->second;
    if (!b->ShouldRun(argc, argv)) {
      continue;
    }
    if (need_header) {
      // Print the column header once, before the first result line.
      printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
      fflush(stdout);
      need_header = false;
    }
    b->Run();
  }
  if (need_header) {
    // No benchmark matched the filters: list what was available.
    fprintf(stderr, "No matching benchmarks!\n");
    fprintf(stderr, "Available benchmarks:\n");
    for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
      fprintf(stderr, "  %s\n", it->second->Name());
    }
    exit(EXIT_FAILURE);
  }
  return 0;
}

View File

@ -4,12 +4,16 @@
typedef int TensorIndex; typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
#include "testing/base/public/benchmark.h" #include "benchmark.h"
#define BENCHMARK_RANGE(bench, lo, hi) \
BENCHMARK(bench)->Range(lo, hi)
using Eigen::Tensor; using Eigen::Tensor;
using Eigen::TensorMap; using Eigen::TensorMap;
typedef int64_t int64;
// TODO(bsteiner): also templatize on the input type since we have users // TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats. // for int8 as well as floats.
@ -43,7 +47,7 @@ template <typename Device> class BenchmarkSuite {
void random(int num_iters) { void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_); eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming(); StartBenchmarkTiming();
@ -56,16 +60,16 @@ template <typename Device> class BenchmarkSuite {
void slicing(int num_iters) { void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_); eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0)); const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2)); const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0)); const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -85,12 +89,12 @@ template <typename Device> class BenchmarkSuite {
void shuffling(int num_iters) { void shuffling(int num_iters) {
eigen_assert(m_ == n_); eigen_assert(m_ == n_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_); const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_); const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<int, 2> shuffle(1, 0); const Eigen::array<int, 2> shuffle = {{1, 0}};
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -102,9 +106,9 @@ template <typename Device> class BenchmarkSuite {
void padding(int num_iters) { void padding(int num_iters) {
eigen_assert(m_ == k_); eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3); const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_); const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
@ -121,12 +125,12 @@ template <typename Device> class BenchmarkSuite {
void striding(int num_iters) { void striding(int num_iters) {
eigen_assert(m_ == k_); eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_); const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2); const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<TensorIndex, 2> strides(1, 2); const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -137,14 +141,14 @@ template <typename Device> class BenchmarkSuite {
} }
void broadcasting(int num_iters) { void broadcasting(int num_iters) {
const Eigen::array<TensorIndex, 2> size_a(m_, 1); const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_c(m_, n_); const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
#if defined(__CUDACC__) #ifndef EIGEN_HAS_INDEX_LIST
// nvcc doesn't support cxx11 // nvcc doesn't support cxx11
const Eigen::array<int, 2> broadcast(1, n_); const Eigen::array<int, 2> broadcast = {{1, n_}};
#else #else
// Take advantage of cxx11 to give the compiler information it can use to // Take advantage of cxx11 to give the compiler information it can use to
// optimize the code. // optimize the code.
@ -162,7 +166,7 @@ template <typename Device> class BenchmarkSuite {
void coeffWiseOp(int num_iters) { void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_); eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@ -178,7 +182,7 @@ template <typename Device> class BenchmarkSuite {
void algebraicFunc(int num_iters) { void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_); eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@ -194,7 +198,7 @@ template <typename Device> class BenchmarkSuite {
void transcendentalFunc(int num_iters) { void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_); eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@ -210,12 +214,12 @@ template <typename Device> class BenchmarkSuite {
// Simple reduction // Simple reduction
void reduction(int num_iters) { void reduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size(k_, n_); const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size(n_); const Eigen::array<TensorIndex, 1> output_size = {{n_}};
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
const Eigen::array<TensorIndex, 1> sum_along_dim(0); const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -228,16 +232,16 @@ template <typename Device> class BenchmarkSuite {
// do a contraction which is equivalent to a matrix multiplication // do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) { void contraction(int num_iters) {
const Eigen::array<TensorIndex, 2> sizeA(m_, k_); const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
const Eigen::array<TensorIndex, 2> sizeB(k_, n_); const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
const Eigen::array<TensorIndex, 2> sizeC(m_, n_); const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<float, 2>::DimensionPair DimPair; typedef typename Tensor<float, 2>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -249,14 +253,14 @@ template <typename Device> class BenchmarkSuite {
} }
void convolution(int num_iters, int kernel_x, int kernel_y) { void convolution(int num_iters, int kernel_x, int kernel_y) {
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_); const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y); const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
const Eigen::array<TensorIndex, 2> result_sizes( const Eigen::array<TensorIndex, 2> result_sizes =
m_ - kernel_x + 1, n_ - kernel_y + 1); {{m_ - kernel_x + 1, n_ - kernel_y + 1}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1); Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
StartBenchmarkTiming(); StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) { for (int iter = 0; iter < num_iters; ++iter) {
@ -280,7 +284,7 @@ template <typename Device> class BenchmarkSuite {
device_.memset(b_, 23, k_ * n_ * sizeof(float)); device_.memset(b_, 23, k_ * n_ * sizeof(float));
device_.memset(c_, 31, m_ * n_ * sizeof(float)); device_.memset(c_, 31, m_ * n_ * sizeof(float));
BenchmarkUseRealTime(); //BenchmarkUseRealTime();
} }
inline void finalizeBenchmark(int64 num_items) { inline void finalizeBenchmark(int64 num_items) {
@ -290,13 +294,13 @@ template <typename Device> class BenchmarkSuite {
} }
#endif #endif
StopBenchmarkTiming(); StopBenchmarkTiming();
SetBenchmarkItemsProcessed(num_items); SetBenchmarkBytesProcessed(num_items);
} }
size_t m_; TensorIndex m_;
size_t k_; TensorIndex k_;
size_t n_; TensorIndex n_;
float* a_; float* a_;
float* b_; float* b_;
float* c_; float* c_;

View File

@ -1,19 +1,12 @@
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "base/sysinfo.h" #include <string>
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h" #include "tensor_benchmarks.h"
#include "thread/threadpool.h"
#ifdef __ANDROID__
#define CREATE_THREAD_POOL(threads) \ #define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPoolDevice device(threads); Eigen::ThreadPool pool(threads); \
#else Eigen::ThreadPoolDevice device(&pool, threads);
#define CREATE_THREAD_POOL(threads) \
ThreadPool tp(threads); \
tp.StartWorkers(); \
Eigen::ThreadPoolDevice device(&tp, threads);
#endif
// Simple functions // Simple functions
#define BM_FuncCPU(FUNC, THREADS) \ #define BM_FuncCPU(FUNC, THREADS) \
@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads);
CREATE_THREAD_POOL(THREADS); \ CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters); \ suite.FUNC(iters); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12);
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \ suite.FUNC(iters); \
} \ } \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
CREATE_THREAD_POOL(THREADS); \ CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \ suite.FUNC(iters, DIM1, DIM2); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);

View File

@ -3,22 +3,18 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <iostream> #include <iostream>
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
#include "tensor_benchmarks.h"
// Simple functions // Simple functions
#define BM_FuncGPU(FUNC) \ #define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \ static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \ StopBenchmarkTiming(); \
cudaStream_t stream; \ Eigen::CudaStreamDevice stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \ Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \ cudaDeviceSynchronize(); \
suite.FUNC(iters); \ suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
@ -37,13 +33,11 @@ BM_FuncGPU(reduction);
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \ StopBenchmarkTiming(); \
cudaStream_t stream; \ Eigen::CudaStreamDevice stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \ Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \ cudaDeviceSynchronize(); \
suite.FUNC(iters); \ suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
@ -57,13 +51,11 @@ BM_FuncWithInputDimsGPU(contraction, N, 64, N);
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \ StopBenchmarkTiming(); \
cudaStream_t stream; \ Eigen::CudaStreamDevice stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \ Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \ cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \ suite.FUNC(iters, DIM1, DIM2); \
cudaStreamDestroy(stream); \
} \ } \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);