Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-03-07 18:27:40 +08:00)
Run two independent chains when reducing tensors.
Running two chains exposes more instruction-level parallelism by allowing both chains to execute at the same time. Results are a bit noisy, but for medium lengths we almost hit the theoretical upper bound of 2x.

name                      threads               old time/op   new time/op    delta
BM_fullReduction_16T/3    [using 16 threads]    17.3ns ±11%   17.4ns ± 9%        ~   (p=0.178 n=18+19)
BM_fullReduction_16T/4    [using 16 threads]    17.6ns ±17%   17.0ns ±18%        ~   (p=0.835 n=20+19)
BM_fullReduction_16T/7    [using 16 threads]    18.9ns ±12%   18.2ns ±10%        ~   (p=0.756 n=20+18)
BM_fullReduction_16T/8    [using 16 threads]    19.8ns ±13%   19.4ns ±21%        ~   (p=0.512 n=20+20)
BM_fullReduction_16T/10   [using 16 threads]    23.5ns ±15%   20.8ns ±24%  -11.37%   (p=0.000 n=20+19)
BM_fullReduction_16T/15   [using 16 threads]    35.8ns ±21%   26.9ns ±17%  -24.76%   (p=0.000 n=20+19)
BM_fullReduction_16T/16   [using 16 threads]    38.7ns ±22%   27.7ns ±18%  -28.40%   (p=0.000 n=20+19)
BM_fullReduction_16T/31   [using 16 threads]     146ns ±17%     74ns ±11%  -49.05%   (p=0.000 n=20+18)
BM_fullReduction_16T/32   [using 16 threads]     154ns ±19%     84ns ±30%  -45.79%   (p=0.000 n=20+19)
BM_fullReduction_16T/64   [using 16 threads]     603ns ± 8%    308ns ±12%  -48.94%   (p=0.000 n=17+17)
BM_fullReduction_16T/128  [using 16 threads]    2.44µs ±13%   1.22µs ± 1%  -50.29%   (p=0.000 n=17+17)
BM_fullReduction_16T/256  [using 16 threads]    9.84µs ±14%   5.13µs ±30%  -47.82%   (p=0.000 n=19+19)
BM_fullReduction_16T/512  [using 16 threads]    78.0µs ± 9%   56.1µs ±17%  -28.02%   (p=0.000 n=18+20)
BM_fullReduction_16T/1k   [using 16 threads]     325µs ± 5%    263µs ± 4%  -19.00%   (p=0.000 n=20+16)
BM_fullReduction_16T/2k   [using 16 threads]    1.09ms ± 3%   0.99ms ± 1%   -9.04%   (p=0.000 n=20+20)
BM_fullReduction_16T/4k   [using 16 threads]    7.66ms ± 3%   7.57ms ± 3%   -1.24%   (p=0.017 n=20+20)
BM_fullReduction_16T/10k  [using 16 threads]    65.3ms ± 4%   65.0ms ± 3%        ~   (p=0.718 n=20+20)
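The idea in isolation: keep two independent accumulators so consecutive additions do not all serialize on a single register. Below is a minimal standalone C++ sketch of that pattern on a plain float array; it only illustrates the technique and is not the Eigen code (the function name sum_two_chains and the scalar accumulators are illustrative, whereas the actual change in the diff below works on SIMD packets through the reducer).

#include <cstddef>

// Illustrative two-accumulator reduction: even indices feed acc0, odd
// indices feed acc1, so the adds of consecutive loop iterations belong to
// independent dependency chains and can be executed at the same time.
float sum_two_chains(const float* data, std::size_t n) {
  float acc0 = 0.0f;
  float acc1 = 0.0f;
  const std::size_t unrolled = (n / 2) * 2;  // largest even prefix
  for (std::size_t i = 0; i < unrolled; i += 2) {
    acc0 += data[i];      // chain 0
    acc1 += data[i + 1];  // chain 1, independent of chain 0
  }
  if (unrolled < n) {
    acc0 += data[n - 1];  // scalar tail for odd n
  }
  return acc0 + acc1;     // merge the two chains once, at the end
}

Merging the chains only after the loop mirrors the reducer.reducePacket(paccum2, &paccum) call in the diff: the loop body itself carries no cross-chain dependency. Note that for floating-point sums this reorders the additions, so non-associativity can make the result differ slightly from a strictly sequential sum.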
parent a475bf14d4
commit 231ce21535
@@ -242,14 +242,26 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
       const typename Self::Index VectorizedSize =
           (numValuesToReduce / packetSize) * packetSize;
       typename Self::PacketReturnType paccum =
           reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
         reducer.reducePacket(
             self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
       }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j += packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+            firstIndex + j), &paccum);
+      }
+      reducer.reducePacket(paccum2, &paccum);
       for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
            ++j) {
         reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
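For context, a sketch of the kind of workload the BM_fullReduction_16T numbers above correspond to: a full sum() reduction of an n x n float tensor on a 16-thread ThreadPoolDevice. The actual benchmark lives in Eigen's tensor benchmark suite; this is only an approximation written against the public Tensor API, not the benchmark source.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 128;  // corresponds roughly to BM_fullReduction_16T/128
  Eigen::Tensor<float, 2> input(n, n);
  input.setRandom();

  Eigen::ThreadPool pool(16);              // 16 worker threads
  Eigen::ThreadPoolDevice device(&pool, 16);

  Eigen::Tensor<float, 0> result;
  // Full reduction over all dimensions, evaluated on the thread pool.
  result.device(device) = input.sum();

  std::cout << result() << "\n";
  return 0;
}

With a vectorizable reducer such as the default SumReducer, this should exercise the packet-reduction path modified in the hunk above.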