mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-02-11 18:00:51 +08:00
Merged in jiayq/eigen (pull request PR-159)
Modifications to the tensor benchmarks to allow compilation in a standalone fashion.
This commit is contained in:
commit
12f8bd12a2
50
bench/tensors/benchmark.h
Normal file
50
bench/tensors/benchmark.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2012 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace testing {
|
||||||
|
// One registered benchmark: a name plus exactly one of two kinds of function.
// A "plain" function receives only the iteration count; a "range" function
// receives the iteration count and one integer argument taken from the list
// built up with Arg()/Range().
class Benchmark {
 private:
  typedef void (*PlainFn)(int);
  typedef void (*RangeFn)(int, int);

 public:
  // Registers a benchmark that takes only the iteration count.
  Benchmark(const char* name, PlainFn fn) {
    Register(name, fn, NULL);
  }

  // Registers a benchmark that takes (iterations, argument).
  Benchmark(const char* name, RangeFn fn_range) {
    Register(name, NULL, fn_range);
  }

  // Both return `this` so that calls can be chained.
  Benchmark* Arg(int x);
  Benchmark* Range(int lo, int hi);

  const char* Name();

  // Decides, from the program's command line, whether this benchmark runs.
  bool ShouldRun(int argc, char* argv[]);

  void Run();

 private:
  void Register(const char* name, PlainFn fn, RangeFn fn_range);
  void RunRepeatedlyWithArg(int iterations, int arg);
  void RunWithArg(int arg);

  const char* name_;       // Stored as passed in; the string is not copied here.
  PlainFn fn_;             // NULL when the benchmark is a range benchmark.
  RangeFn fn_range_;       // NULL when the benchmark is a plain benchmark.
  std::vector<int> args_;  // Arguments handed to fn_range_, one run per entry.
};
|
||||||
|
} // namespace testing
|
||||||
|
// Hooks that benchmark functions call while they run.

// Records the number of bytes handled by the current run; the runner uses it
// to print a throughput column.
void SetBenchmarkBytesProcessed(int64_t bytes);

// Pauses the benchmark clock, adding the time elapsed so far to the total.
void StopBenchmarkTiming();

// Resumes the benchmark clock if it is currently paused.
void StartBenchmarkTiming();
|
||||||
|
// Registers function `f` as a benchmark named after the function itself.
//
// Expands to the definition of a file-local pointer initialised with a newly
// allocated ::testing::Benchmark, so registration happens when the enclosing
// translation unit's statics are initialised. The expansion ends with the
// `new` expression, which lets callers chain configuration calls, e.g.
//   BENCHMARK(BM_Foo)->Range(8, 1024);
//
// The Benchmark object is intentionally never deleted: its constructor stores
// the pointer in the global registry used by main().
//
// The generated variable name deliberately does not start with an underscore:
// such names are reserved to the implementation in the global namespace.
#define BENCHMARK(f) \
  static ::testing::Benchmark* benchmark_registration_##f __attribute__((unused)) = \
      (new ::testing::Benchmark(#f, f))
|
||||||
|
|
222
bench/tensors/benchmark_main.cc
Normal file
222
bench/tensors/benchmark_main.cc
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2012 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include "benchmark.h"

#include <inttypes.h>
#include <limits.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <algorithm>
#include <map>
#include <string>
|
||||||
|
|
||||||
|
static int64_t g_bytes_processed;
|
||||||
|
static int64_t g_benchmark_total_time_ns;
|
||||||
|
static int64_t g_benchmark_start_time_ns;
|
||||||
|
typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
|
||||||
|
typedef BenchmarkMap::iterator BenchmarkMapIt;
|
||||||
|
|
||||||
|
// Returns the single registry of benchmarks for this process.
//
// The map is a function-local static, so it is constructed the first time
// this function is called. NOTE(review): presumably chosen so registrations
// made from other translation units' static initialisers (see the BENCHMARK
// macro) always find a constructed map — confirm.
BenchmarkMap& gBenchmarks() {
  static BenchmarkMap registry;
  return registry;
}
|
||||||
|
|
||||||
|
// Width of the name column in the report; main() widens it to fit the longest
// registered benchmark name, never narrowing it below this default.
static int g_name_column_width = 20;
|
||||||
|
|
||||||
|
// Rounds an iteration count up to a "nice" value of the form 2, 5 or 10 times
// a power of ten.
//
// With `base` the smallest power of ten such that 10*base >= n, the result is
//   2*base  if n < 2*base,
//   5*base  if n < 5*base,
//   10*base otherwise.
// So Round(1) == 2, Round(2) == 5, Round(5) == 10, Round(10) == 10 and
// Round(11) == 20. Any n <= 1 yields 2.
//
// All arithmetic is done in 64 bits and the result is clamped to INT_MAX, so
// inputs above 10^9 no longer overflow `int` in `base*10`, `5*base` or
// `10*base` (signed overflow is undefined behaviour).
static int Round(int n) {
  const int64_t value = n;

  int64_t base = 1;
  while (base * 10 < value) {
    base *= 10;
  }

  int64_t rounded;
  if (value < 2 * base) {
    rounded = 2 * base;
  } else if (value < 5 * base) {
    rounded = 5 * base;
  } else {
    rounded = 10 * base;
  }

  if (rounded > INT_MAX) {
    return INT_MAX;
  }
  return static_cast<int>(rounded);
}
|
||||||
|
// Reads CLOCK_MONOTONIC and returns it as a single nanosecond count.
//
// The timespec is zeroed first and clock_gettime()'s return value is not
// checked, so a failed call yields 0 rather than an indeterminate value.
static int64_t NanoTime() {
  const int64_t kNanosPerSecond = 1000000000LL;

  struct timespec now;
  now.tv_sec = 0;
  now.tv_nsec = 0;
  clock_gettime(CLOCK_MONOTONIC, &now);

  return static_cast<int64_t>(now.tv_sec) * kNanosPerSecond + now.tv_nsec;
}
|
||||||
|
namespace testing {
|
||||||
|
// Appends one argument value to the list this benchmark will be run with.
// Returns `this` so calls can be chained.
Benchmark* Benchmark::Arg(int arg) {
  args_.insert(args_.end(), arg);
  return this;
}
|
||||||
|
|
||||||
|
// Appends a geometric sequence of arguments: lo, lo*8, lo*64, ... for every
// term strictly below hi, followed by hi itself. The bounds are swapped if
// they are given in the wrong order. Returns `this` so calls can be chained.
//
// Edge cases handled explicitly:
//  * lo <= 0: multiplying a non-positive value by 8 never moves it towards hi
//    (0 stays 0, negatives shrink until they overflow), which would loop
//    forever. After emitting such a lo, the sequence restarts at 1.
//  * large hi: the running value is kept in 64 bits so that multiplying the
//    last term below hi by 8 cannot overflow `int`.
Benchmark* Benchmark::Range(int lo, int hi) {
  const int kRangeMultiplier = 8;

  if (hi < lo) {
    const int swapped = hi;
    hi = lo;
    lo = swapped;
  }

  int64_t next = lo;
  while (next < hi) {
    // next < hi <= INT_MAX here, so the narrowing is lossless.
    args_.push_back(static_cast<int>(next));
    next = (next < 1) ? 1 : next * kRangeMultiplier;
  }

  // We always run the hi number.
  args_.push_back(hi);
  return this;
}
|
||||||
|
|
||||||
|
const char* Benchmark::Name() {
|
||||||
|
return name_;
|
||||||
|
}
|
||||||
|
bool Benchmark::ShouldRun(int argc, char* argv[]) {
|
||||||
|
if (argc == 1) {
|
||||||
|
return true; // With no arguments, we run all benchmarks.
|
||||||
|
}
|
||||||
|
// Otherwise, we interpret each argument as a regular expression and
|
||||||
|
// see if any of our benchmarks match.
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
regex_t re;
|
||||||
|
if (regcomp(&re, argv[i], 0) != 0) {
|
||||||
|
fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
int match = regexec(&re, name_, 0, NULL, 0);
|
||||||
|
regfree(&re);
|
||||||
|
if (match != REG_NOMATCH) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
|
||||||
|
name_ = name;
|
||||||
|
fn_ = fn;
|
||||||
|
fn_range_ = fn_range;
|
||||||
|
if (fn_ == NULL && fn_range_ == NULL) {
|
||||||
|
fprintf(stderr, "%s: missing function\n", name_);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
gBenchmarks().insert(std::make_pair(name, this));
|
||||||
|
}
|
||||||
|
void Benchmark::Run() {
|
||||||
|
if (fn_ != NULL) {
|
||||||
|
RunWithArg(0);
|
||||||
|
} else {
|
||||||
|
if (args_.empty()) {
|
||||||
|
fprintf(stderr, "%s: no args!\n", name_);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < args_.size(); ++i) {
|
||||||
|
RunWithArg(args_[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
|
||||||
|
g_bytes_processed = 0;
|
||||||
|
g_benchmark_total_time_ns = 0;
|
||||||
|
g_benchmark_start_time_ns = NanoTime();
|
||||||
|
if (fn_ != NULL) {
|
||||||
|
fn_(iterations);
|
||||||
|
} else {
|
||||||
|
fn_range_(iterations, arg);
|
||||||
|
}
|
||||||
|
if (g_benchmark_start_time_ns != 0) {
|
||||||
|
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void Benchmark::RunWithArg(int arg) {
|
||||||
|
// run once in case it's expensive
|
||||||
|
int iterations = 1;
|
||||||
|
RunRepeatedlyWithArg(iterations, arg);
|
||||||
|
while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
|
||||||
|
int last = iterations;
|
||||||
|
if (g_benchmark_total_time_ns/iterations == 0) {
|
||||||
|
iterations = 1e9;
|
||||||
|
} else {
|
||||||
|
iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
|
||||||
|
}
|
||||||
|
iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
|
||||||
|
iterations = Round(iterations);
|
||||||
|
RunRepeatedlyWithArg(iterations, arg);
|
||||||
|
}
|
||||||
|
char throughput[100];
|
||||||
|
throughput[0] = '\0';
|
||||||
|
if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) {
|
||||||
|
double mib_processed = static_cast<double>(g_bytes_processed)/1e6;
|
||||||
|
double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
|
||||||
|
snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds);
|
||||||
|
}
|
||||||
|
char full_name[100];
|
||||||
|
if (fn_range_ != NULL) {
|
||||||
|
if (arg >= (1<<20)) {
|
||||||
|
snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
|
||||||
|
} else if (arg >= (1<<10)) {
|
||||||
|
snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
|
||||||
|
} else {
|
||||||
|
snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
snprintf(full_name, sizeof(full_name), "%s", name_);
|
||||||
|
}
|
||||||
|
printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
|
||||||
|
iterations, g_benchmark_total_time_ns/iterations, throughput);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
} // namespace testing
|
||||||
|
// Records the byte count for the run in progress. The value is overwritten,
// not accumulated, and is reset to 0 at the start of every run.
void SetBenchmarkBytesProcessed(int64_t x) {
  g_bytes_processed = x;
}
|
||||||
|
void StopBenchmarkTiming() {
|
||||||
|
if (g_benchmark_start_time_ns != 0) {
|
||||||
|
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||||
|
}
|
||||||
|
g_benchmark_start_time_ns = 0;
|
||||||
|
}
|
||||||
|
void StartBenchmarkTiming() {
|
||||||
|
if (g_benchmark_start_time_ns == 0) {
|
||||||
|
g_benchmark_start_time_ns = NanoTime();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
if (gBenchmarks().empty()) {
|
||||||
|
fprintf(stderr, "No benchmarks registered!\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||||
|
int name_width = static_cast<int>(strlen(it->second->Name()));
|
||||||
|
g_name_column_width = std::max(g_name_column_width, name_width);
|
||||||
|
}
|
||||||
|
bool need_header = true;
|
||||||
|
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||||
|
::testing::Benchmark* b = it->second;
|
||||||
|
if (b->ShouldRun(argc, argv)) {
|
||||||
|
if (need_header) {
|
||||||
|
printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
|
||||||
|
fflush(stdout);
|
||||||
|
need_header = false;
|
||||||
|
}
|
||||||
|
b->Run();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (need_header) {
|
||||||
|
fprintf(stderr, "No matching benchmarks!\n");
|
||||||
|
fprintf(stderr, "Available benchmarks:\n");
|
||||||
|
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||||
|
fprintf(stderr, " %s\n", it->second->Name());
|
||||||
|
}
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
@ -4,12 +4,16 @@
|
|||||||
typedef int TensorIndex;
|
typedef int TensorIndex;
|
||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
|
|
||||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
#include "unsupported/Eigen/CXX11/Tensor"
|
||||||
#include "testing/base/public/benchmark.h"
|
#include "benchmark.h"
|
||||||
|
|
||||||
|
#define BENCHMARK_RANGE(bench, lo, hi) \
|
||||||
|
BENCHMARK(bench)->Range(lo, hi)
|
||||||
|
|
||||||
using Eigen::Tensor;
|
using Eigen::Tensor;
|
||||||
using Eigen::TensorMap;
|
using Eigen::TensorMap;
|
||||||
|
|
||||||
|
typedef int64_t int64;
|
||||||
|
|
||||||
// TODO(bsteiner): also templatize on the input type since we have users
|
// TODO(bsteiner): also templatize on the input type since we have users
|
||||||
// for int8 as well as floats.
|
// for int8 as well as floats.
|
||||||
@ -43,7 +47,7 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void random(int num_iters) {
|
void random(int num_iters) {
|
||||||
eigen_assert(m_ == k_ && k_ == n_);
|
eigen_assert(m_ == k_ && k_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
@ -56,16 +60,16 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void slicing(int num_iters) {
|
void slicing(int num_iters) {
|
||||||
eigen_assert(m_ == k_ && k_ == n_);
|
eigen_assert(m_ == k_ && k_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||||
|
|
||||||
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
|
||||||
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
|
const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
|
||||||
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
|
const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
|
||||||
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
|
const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
|
||||||
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -85,12 +89,12 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void shuffling(int num_iters) {
|
void shuffling(int num_iters) {
|
||||||
eigen_assert(m_ == n_);
|
eigen_assert(m_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||||
|
|
||||||
const Eigen::array<int, 2> shuffle(1, 0);
|
const Eigen::array<int, 2> shuffle = {{1, 0}};
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -102,9 +106,9 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void padding(int num_iters) {
|
void padding(int num_iters) {
|
||||||
eigen_assert(m_ == k_);
|
eigen_assert(m_ == k_);
|
||||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
|
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||||
|
|
||||||
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
|
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
|
||||||
@ -121,12 +125,12 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void striding(int num_iters) {
|
void striding(int num_iters) {
|
||||||
eigen_assert(m_ == k_);
|
eigen_assert(m_ == k_);
|
||||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||||
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
|
const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||||
|
|
||||||
const Eigen::array<TensorIndex, 2> strides(1, 2);
|
const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -137,14 +141,14 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void broadcasting(int num_iters) {
|
void broadcasting(int num_iters) {
|
||||||
const Eigen::array<TensorIndex, 2> size_a(m_, 1);
|
const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||||
const Eigen::array<TensorIndex, 2> size_c(m_, n_);
|
const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
|
||||||
|
|
||||||
#if defined(__CUDACC__)
|
#ifndef EIGEN_HAS_INDEX_LIST
|
||||||
// nvcc doesn't support cxx11
|
// nvcc doesn't support cxx11
|
||||||
const Eigen::array<int, 2> broadcast(1, n_);
|
const Eigen::array<int, 2> broadcast = {{1, n_}};
|
||||||
#else
|
#else
|
||||||
// Take advantage of cxx11 to give the compiler information it can use to
|
// Take advantage of cxx11 to give the compiler information it can use to
|
||||||
// optimize the code.
|
// optimize the code.
|
||||||
@ -162,7 +166,7 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void coeffWiseOp(int num_iters) {
|
void coeffWiseOp(int num_iters) {
|
||||||
eigen_assert(m_ == k_ && k_ == n_);
|
eigen_assert(m_ == k_ && k_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||||
@ -178,7 +182,7 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void algebraicFunc(int num_iters) {
|
void algebraicFunc(int num_iters) {
|
||||||
eigen_assert(m_ == k_ && k_ == n_);
|
eigen_assert(m_ == k_ && k_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||||
@ -194,7 +198,7 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
void transcendentalFunc(int num_iters) {
|
void transcendentalFunc(int num_iters) {
|
||||||
eigen_assert(m_ == k_ && k_ == n_);
|
eigen_assert(m_ == k_ && k_ == n_);
|
||||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||||
@ -210,12 +214,12 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
// Simple reduction
|
// Simple reduction
|
||||||
void reduction(int num_iters) {
|
void reduction(int num_iters) {
|
||||||
const Eigen::array<TensorIndex, 2> input_size(k_, n_);
|
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
|
||||||
const Eigen::array<TensorIndex, 1> output_size(n_);
|
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||||
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
|
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
|
||||||
|
|
||||||
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
|
const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -228,16 +232,16 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
|
|
||||||
// do a contraction which is equivalent to a matrix multiplication
|
// do a contraction which is equivalent to a matrix multiplication
|
||||||
void contraction(int num_iters) {
|
void contraction(int num_iters) {
|
||||||
const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
|
const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
|
||||||
const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
|
const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
|
||||||
const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
|
const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
|
||||||
|
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
|
||||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
|
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
|
||||||
|
|
||||||
typedef typename Tensor<float, 2>::DimensionPair DimPair;
|
typedef typename Tensor<float, 2>::DimensionPair DimPair;
|
||||||
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -249,14 +253,14 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void convolution(int num_iters, int kernel_x, int kernel_y) {
|
void convolution(int num_iters, int kernel_x, int kernel_y) {
|
||||||
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
|
const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
|
||||||
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
|
const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
|
||||||
const Eigen::array<TensorIndex, 2> result_sizes(
|
const Eigen::array<TensorIndex, 2> result_sizes =
|
||||||
m_ - kernel_x + 1, n_ - kernel_y + 1);
|
{{m_ - kernel_x + 1, n_ - kernel_y + 1}};
|
||||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
|
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
|
||||||
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
|
Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
|
||||||
|
|
||||||
StartBenchmarkTiming();
|
StartBenchmarkTiming();
|
||||||
for (int iter = 0; iter < num_iters; ++iter) {
|
for (int iter = 0; iter < num_iters; ++iter) {
|
||||||
@ -280,7 +284,7 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
device_.memset(b_, 23, k_ * n_ * sizeof(float));
|
device_.memset(b_, 23, k_ * n_ * sizeof(float));
|
||||||
device_.memset(c_, 31, m_ * n_ * sizeof(float));
|
device_.memset(c_, 31, m_ * n_ * sizeof(float));
|
||||||
|
|
||||||
BenchmarkUseRealTime();
|
//BenchmarkUseRealTime();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void finalizeBenchmark(int64 num_items) {
|
inline void finalizeBenchmark(int64 num_items) {
|
||||||
@ -290,13 +294,13 @@ template <typename Device> class BenchmarkSuite {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
StopBenchmarkTiming();
|
StopBenchmarkTiming();
|
||||||
SetBenchmarkItemsProcessed(num_items);
|
SetBenchmarkBytesProcessed(num_items);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
size_t m_;
|
TensorIndex m_;
|
||||||
size_t k_;
|
TensorIndex k_;
|
||||||
size_t n_;
|
TensorIndex n_;
|
||||||
float* a_;
|
float* a_;
|
||||||
float* b_;
|
float* b_;
|
||||||
float* c_;
|
float* c_;
|
||||||
|
@ -1,19 +1,12 @@
|
|||||||
#define EIGEN_USE_THREADS
|
#define EIGEN_USE_THREADS
|
||||||
|
|
||||||
#include "base/sysinfo.h"
|
#include <string>
|
||||||
#include "strings/strcat.h"
|
|
||||||
#include "third_party/eigen3/tensor_benchmarks.h"
|
#include "tensor_benchmarks.h"
|
||||||
#include "thread/threadpool.h"
|
|
||||||
|
|
||||||
#ifdef __ANDROID__
|
|
||||||
#define CREATE_THREAD_POOL(threads) \
|
#define CREATE_THREAD_POOL(threads) \
|
||||||
Eigen::ThreadPoolDevice device(threads);
|
Eigen::ThreadPool pool(threads); \
|
||||||
#else
|
Eigen::ThreadPoolDevice device(&pool, threads);
|
||||||
#define CREATE_THREAD_POOL(threads) \
|
|
||||||
ThreadPool tp(threads); \
|
|
||||||
tp.StartWorkers(); \
|
|
||||||
Eigen::ThreadPoolDevice device(&tp, threads);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Simple functions
|
// Simple functions
|
||||||
#define BM_FuncCPU(FUNC, THREADS) \
|
#define BM_FuncCPU(FUNC, THREADS) \
|
||||||
@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads);
|
|||||||
CREATE_THREAD_POOL(THREADS); \
|
CREATE_THREAD_POOL(THREADS); \
|
||||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||||
suite.FUNC(iters); \
|
suite.FUNC(iters); \
|
||||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
|
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
|
||||||
|
|
||||||
@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12);
|
|||||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
|
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
|
||||||
suite.FUNC(iters); \
|
suite.FUNC(iters); \
|
||||||
} \
|
} \
|
||||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
|
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
|
||||||
|
|
||||||
@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
|
|||||||
CREATE_THREAD_POOL(THREADS); \
|
CREATE_THREAD_POOL(THREADS); \
|
||||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||||
suite.FUNC(iters, DIM1, DIM2); \
|
suite.FUNC(iters, DIM1, DIM2); \
|
||||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
|
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
|
||||||
|
|
||||||
|
@ -3,22 +3,18 @@
|
|||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include "strings/strcat.h"
|
|
||||||
#include "third_party/eigen3/tensor_benchmarks.h"
|
|
||||||
|
|
||||||
|
|
||||||
|
#include "tensor_benchmarks.h"
|
||||||
|
|
||||||
// Simple functions
|
// Simple functions
|
||||||
#define BM_FuncGPU(FUNC) \
|
#define BM_FuncGPU(FUNC) \
|
||||||
static void BM_##FUNC(int iters, int N) { \
|
static void BM_##FUNC(int iters, int N) { \
|
||||||
StopBenchmarkTiming(); \
|
StopBenchmarkTiming(); \
|
||||||
cudaStream_t stream; \
|
Eigen::CudaStreamDevice stream; \
|
||||||
cudaStreamCreate(&stream); \
|
|
||||||
Eigen::GpuDevice device(&stream); \
|
Eigen::GpuDevice device(&stream); \
|
||||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||||
cudaDeviceSynchronize(); \
|
cudaDeviceSynchronize(); \
|
||||||
suite.FUNC(iters); \
|
suite.FUNC(iters); \
|
||||||
cudaStreamDestroy(stream); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||||
|
|
||||||
@ -37,13 +33,11 @@ BM_FuncGPU(reduction);
|
|||||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||||
StopBenchmarkTiming(); \
|
StopBenchmarkTiming(); \
|
||||||
cudaStream_t stream; \
|
Eigen::CudaStreamDevice stream; \
|
||||||
cudaStreamCreate(&stream); \
|
|
||||||
Eigen::GpuDevice device(&stream); \
|
Eigen::GpuDevice device(&stream); \
|
||||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
|
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
|
||||||
cudaDeviceSynchronize(); \
|
cudaDeviceSynchronize(); \
|
||||||
suite.FUNC(iters); \
|
suite.FUNC(iters); \
|
||||||
cudaStreamDestroy(stream); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||||
|
|
||||||
@ -57,13 +51,11 @@ BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
|||||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||||
StopBenchmarkTiming(); \
|
StopBenchmarkTiming(); \
|
||||||
cudaStream_t stream; \
|
Eigen::CudaStreamDevice stream; \
|
||||||
cudaStreamCreate(&stream); \
|
|
||||||
Eigen::GpuDevice device(&stream); \
|
Eigen::GpuDevice device(&stream); \
|
||||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||||
cudaDeviceSynchronize(); \
|
cudaDeviceSynchronize(); \
|
||||||
suite.FUNC(iters, DIM1, DIM2); \
|
suite.FUNC(iters, DIM1, DIM2); \
|
||||||
cudaStreamDestroy(stream); \
|
|
||||||
} \
|
} \
|
||||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user