Mirror of git://gcc.gnu.org/git/gcc.git (synced 2025-04-08 22:31:40 +08:00)
re PR libfortran/78379 (Processor-specific versions for matmul)
2016-12-03  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/78379
	* config/i386/cpuinfo.c: Move enums for processor vendors,
	processor type, processor subtypes and declaration of
	struct __processor_model into
	* config/i386/cpuinfo.h: New header file.
	* Makefile.am: Add dependence of m4/matmul_internal.m4 to
	matmul files.
	* Makefile.in: Regenerated.
	* acinclude.m4: Check for AVX, AVX2 and AVX512F.
	* config.h.in: Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F.
	* configure: Regenerated.
	* configure.ac: Use checks for AVX, AVX2 and AVX512F.
	* m4/matmul_internal.m4: New file; working part of matmul.m4.
	* m4/matmul.m4: Implement architecture-specific switching
	for AVX, AVX2 and AVX512F by including matmul_internal.m4
	multiple times.
	* generated/matmul_c10.c: Regenerated.
	* generated/matmul_c16.c: Regenerated.
	* generated/matmul_c4.c: Regenerated.
	* generated/matmul_c8.c: Regenerated.
	* generated/matmul_i1.c: Regenerated.
	* generated/matmul_i16.c: Regenerated.
	* generated/matmul_i2.c: Regenerated.
	* generated/matmul_i4.c: Regenerated.
	* generated/matmul_i8.c: Regenerated.
	* generated/matmul_r10.c: Regenerated.
	* generated/matmul_r16.c: Regenerated.
	* generated/matmul_r4.c: Regenerated.
	* generated/matmul_r8.c: Regenerated.

From-SVN: r243219
This commit is contained in: parent 802583a210, commit 31cfd83286
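What the patch does, in one paragraph: matmul.m4 now expands the actual kernel (moved into matmul_internal.m4) several times, once plain and once per optional instruction set (AVX, AVX2, AVX512F), each guarded by the new HAVE_* configure macros and compiled with the matching target attribute; the exported matmul_<type> function then, on its first call, inspects the CPU feature bits that libgcc publishes through __cpu_model (declarations now shared via the new cpuinfo.h) and remembers which kernel to call. A minimal, self-contained C sketch of that dispatch pattern, using the public __builtin_cpu_supports builtin as a stand-in for the library-internal __cpu_model test and plain arrays instead of gfortran descriptors (illustrative names, not the patch's code):

/* Sketch only: one-time, CPU-feature-based kernel selection, x86 + GCC.  */

static void
matmul_vanilla (double *c, const double *a, const double *b, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      {
	double s = 0.0;
	for (int k = 0; k < n; k++)
	  s += a[i * n + k] * b[k * n + j];
	c[i * n + j] = s;
      }
}

/* Same source, but this copy may be vectorized with AVX2 instructions.  */
__attribute__ ((__target__ ("avx2"))) static void
matmul_avx2 (double *c, const double *a, const double *b, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      {
	double s = 0.0;
	for (int k = 0; k < n; k++)
	  s += a[i * n + k] * b[k * n + j];
	c[i * n + j] = s;
      }
}

void
matmul_r8_like (double *c, const double *a, const double *b, int n)
{
  static void (*matmul_p) (double *, const double *, const double *, int);

  if (matmul_p == 0)		/* Chosen on the first call, then reused.  */
    {
      __builtin_cpu_init ();
      matmul_p = matmul_vanilla;
      if (__builtin_cpu_supports ("avx2"))
	matmul_p = matmul_avx2;
    }
  (*matmul_p) (c, a, b, n);
}

One wrinkle visible in the matmul.m4 hunk below: for REAL kernels the AVX and AVX2 code would be identical, so matmul.m4 #undefs HAVE_AVX2 there and only emits a separate AVX2 kernel for integer types.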
libgcc/ChangeLog
@@ -1,3 +1,11 @@
+2016-12-03  Thomas Koenig  <tkoenig@gcc.gnu.org>
+
+	PR fortran/78379
+	* config/i386/cpuinfo.c: Move enums for processor vendors,
+	processor type, processor subtypes and declaration of
+	struct __processor_model into
+	* config/i386/cpuinfo.h: New header file.
+
 2016-12-02  Andre Vieira  <andre.simoesdiasvieira@arm.com>
 	    Thomas Preud'homme  <thomas.preudhomme@arm.com>
 
libgcc/config/i386/cpuinfo.c
@@ -26,6 +26,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
 #include "cpuid.h"
 #include "tsystem.h"
 #include "auto-target.h"
+#include "cpuinfo.h"
 
 #ifdef HAVE_INIT_PRIORITY
 #define CONSTRUCTOR_PRIORITY (101)
@ -36,97 +37,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
int __cpu_indicator_init (void)
|
||||
__attribute__ ((constructor CONSTRUCTOR_PRIORITY));
|
||||
|
||||
/* Processor Vendor and Models. */
|
||||
|
||||
enum processor_vendor
|
||||
{
|
||||
VENDOR_INTEL = 1,
|
||||
VENDOR_AMD,
|
||||
VENDOR_OTHER,
|
||||
VENDOR_MAX
|
||||
};
|
||||
|
||||
/* Any new types or subtypes have to be inserted at the end. */
|
||||
|
||||
enum processor_types
|
||||
{
|
||||
INTEL_BONNELL = 1,
|
||||
INTEL_CORE2,
|
||||
INTEL_COREI7,
|
||||
AMDFAM10H,
|
||||
AMDFAM15H,
|
||||
INTEL_SILVERMONT,
|
||||
INTEL_KNL,
|
||||
AMD_BTVER1,
|
||||
AMD_BTVER2,
|
||||
AMDFAM17H,
|
||||
CPU_TYPE_MAX
|
||||
};
|
||||
|
||||
enum processor_subtypes
|
||||
{
|
||||
INTEL_COREI7_NEHALEM = 1,
|
||||
INTEL_COREI7_WESTMERE,
|
||||
INTEL_COREI7_SANDYBRIDGE,
|
||||
AMDFAM10H_BARCELONA,
|
||||
AMDFAM10H_SHANGHAI,
|
||||
AMDFAM10H_ISTANBUL,
|
||||
AMDFAM15H_BDVER1,
|
||||
AMDFAM15H_BDVER2,
|
||||
AMDFAM15H_BDVER3,
|
||||
AMDFAM15H_BDVER4,
|
||||
AMDFAM17H_ZNVER1,
|
||||
INTEL_COREI7_IVYBRIDGE,
|
||||
INTEL_COREI7_HASWELL,
|
||||
INTEL_COREI7_BROADWELL,
|
||||
INTEL_COREI7_SKYLAKE,
|
||||
INTEL_COREI7_SKYLAKE_AVX512,
|
||||
CPU_SUBTYPE_MAX
|
||||
};
|
||||
|
||||
/* ISA Features supported. New features have to be inserted at the end. */
|
||||
|
||||
enum processor_features
|
||||
{
|
||||
FEATURE_CMOV = 0,
|
||||
FEATURE_MMX,
|
||||
FEATURE_POPCNT,
|
||||
FEATURE_SSE,
|
||||
FEATURE_SSE2,
|
||||
FEATURE_SSE3,
|
||||
FEATURE_SSSE3,
|
||||
FEATURE_SSE4_1,
|
||||
FEATURE_SSE4_2,
|
||||
FEATURE_AVX,
|
||||
FEATURE_AVX2,
|
||||
FEATURE_SSE4_A,
|
||||
FEATURE_FMA4,
|
||||
FEATURE_XOP,
|
||||
FEATURE_FMA,
|
||||
FEATURE_AVX512F,
|
||||
FEATURE_BMI,
|
||||
FEATURE_BMI2,
|
||||
FEATURE_AES,
|
||||
FEATURE_PCLMUL,
|
||||
FEATURE_AVX512VL,
|
||||
FEATURE_AVX512BW,
|
||||
FEATURE_AVX512DQ,
|
||||
FEATURE_AVX512CD,
|
||||
FEATURE_AVX512ER,
|
||||
FEATURE_AVX512PF,
|
||||
FEATURE_AVX512VBMI,
|
||||
FEATURE_AVX512IFMA,
|
||||
FEATURE_AVX5124VNNIW,
|
||||
FEATURE_AVX5124FMAPS
|
||||
};
|
||||
|
||||
struct __processor_model
|
||||
{
|
||||
unsigned int __cpu_vendor;
|
||||
unsigned int __cpu_type;
|
||||
unsigned int __cpu_subtype;
|
||||
unsigned int __cpu_features[1];
|
||||
} __cpu_model = { };
|
||||
struct __processor_model __cpu_model = { };
|
||||
|
||||
|
||||
/* Get the specific type of AMD CPU. */
|
||||
|
116  libgcc/config/i386/cpuinfo.h  (new file)
@ -0,0 +1,116 @@
|
||||
/* Get CPU type and Features for x86 processors.
|
||||
Copyright (C) 2012-2016 Free Software Foundation, Inc.
|
||||
Contributed by Sriraman Tallam (tmsriram@google.com)
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it under
|
||||
the terms of the GNU General Public License as published by the Free
|
||||
Software Foundation; either version 3, or (at your option) any later
|
||||
version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
for more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Processor Vendor and Models. */
|
||||
|
||||
enum processor_vendor
|
||||
{
|
||||
VENDOR_INTEL = 1,
|
||||
VENDOR_AMD,
|
||||
VENDOR_OTHER,
|
||||
VENDOR_MAX
|
||||
};
|
||||
|
||||
/* Any new types or subtypes have to be inserted at the end. */
|
||||
|
||||
enum processor_types
|
||||
{
|
||||
INTEL_BONNELL = 1,
|
||||
INTEL_CORE2,
|
||||
INTEL_COREI7,
|
||||
AMDFAM10H,
|
||||
AMDFAM15H,
|
||||
INTEL_SILVERMONT,
|
||||
INTEL_KNL,
|
||||
AMD_BTVER1,
|
||||
AMD_BTVER2,
|
||||
AMDFAM17H,
|
||||
CPU_TYPE_MAX
|
||||
};
|
||||
|
||||
enum processor_subtypes
|
||||
{
|
||||
INTEL_COREI7_NEHALEM = 1,
|
||||
INTEL_COREI7_WESTMERE,
|
||||
INTEL_COREI7_SANDYBRIDGE,
|
||||
AMDFAM10H_BARCELONA,
|
||||
AMDFAM10H_SHANGHAI,
|
||||
AMDFAM10H_ISTANBUL,
|
||||
AMDFAM15H_BDVER1,
|
||||
AMDFAM15H_BDVER2,
|
||||
AMDFAM15H_BDVER3,
|
||||
AMDFAM15H_BDVER4,
|
||||
AMDFAM17H_ZNVER1,
|
||||
INTEL_COREI7_IVYBRIDGE,
|
||||
INTEL_COREI7_HASWELL,
|
||||
INTEL_COREI7_BROADWELL,
|
||||
INTEL_COREI7_SKYLAKE,
|
||||
INTEL_COREI7_SKYLAKE_AVX512,
|
||||
CPU_SUBTYPE_MAX
|
||||
};
|
||||
|
||||
/* ISA Features supported. New features have to be inserted at the end. */
|
||||
|
||||
enum processor_features
|
||||
{
|
||||
FEATURE_CMOV = 0,
|
||||
FEATURE_MMX,
|
||||
FEATURE_POPCNT,
|
||||
FEATURE_SSE,
|
||||
FEATURE_SSE2,
|
||||
FEATURE_SSE3,
|
||||
FEATURE_SSSE3,
|
||||
FEATURE_SSE4_1,
|
||||
FEATURE_SSE4_2,
|
||||
FEATURE_AVX,
|
||||
FEATURE_AVX2,
|
||||
FEATURE_SSE4_A,
|
||||
FEATURE_FMA4,
|
||||
FEATURE_XOP,
|
||||
FEATURE_FMA,
|
||||
FEATURE_AVX512F,
|
||||
FEATURE_BMI,
|
||||
FEATURE_BMI2,
|
||||
FEATURE_AES,
|
||||
FEATURE_PCLMUL,
|
||||
FEATURE_AVX512VL,
|
||||
FEATURE_AVX512BW,
|
||||
FEATURE_AVX512DQ,
|
||||
FEATURE_AVX512CD,
|
||||
FEATURE_AVX512ER,
|
||||
FEATURE_AVX512PF,
|
||||
FEATURE_AVX512VBMI,
|
||||
FEATURE_AVX512IFMA,
|
||||
FEATURE_AVX5124VNNIW,
|
||||
FEATURE_AVX5124FMAPS
|
||||
};
|
||||
|
||||
extern struct __processor_model
|
||||
{
|
||||
unsigned int __cpu_vendor;
|
||||
unsigned int __cpu_type;
|
||||
unsigned int __cpu_subtype;
|
||||
unsigned int __cpu_features[1];
|
||||
} __cpu_model;
|
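The point of the new header is that run-time libraries other than libgcc can now see the processor model that libgcc's constructor (__cpu_indicator_init) fills in before main runs; the feature word is a plain bit set indexed by enum processor_features, so a consumer only needs a mask test. A small standalone illustration (declarations copied from the header above; the real consumer, matmul.m4 below, simply includes config/i386/cpuinfo.h):

/* Illustration only: reading __cpu_model from outside libgcc.  The bit
   positions match the enum order above: FEATURE_AVX = 9, FEATURE_AVX2 = 10,
   FEATURE_AVX512F = 15.  Linking with gcc on x86 resolves __cpu_model.  */
extern struct __processor_model
{
  unsigned int __cpu_vendor;
  unsigned int __cpu_type;
  unsigned int __cpu_subtype;
  unsigned int __cpu_features[1];
} __cpu_model;

enum { FEATURE_AVX = 9, FEATURE_AVX2 = 10, FEATURE_AVX512F = 15 };

static int
cpu_has (int feature)
{
  /* Exactly the test the new matmul wrapper performs.  */
  return (__cpu_model.__cpu_features[0] & (1U << feature)) != 0;
}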
libgfortran/ChangeLog
@@ -1,3 +1,31 @@
+2016-12-03  Thomas Koenig  <tkoenig@gcc.gnu.org>
+
+	PR fortran/78379
+	* Makefile.am: Add dependence of m4/matmul_internal.m4 to
+	matmul files.
+	* Makefile.in: Regenerated.
+	* acinclude.m4: Check for AVX, AVX2 and AVX512F.
+	* config.h.in: Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F.
+	* configure: Regenerated.
+	* configure.ac: Use checks for AVX, AVX2 and AVX512F.
+	* m4/matmul_internal.m4: New file; working part of matmul.m4.
+	* m4/matmul.m4: Implement architecture-specific switching
+	for AVX, AVX2 and AVX512F by including matmul_internal.m4
+	multiple times.
+	* generated/matmul_c10.c: Regenerated.
+	* generated/matmul_c16.c: Regenerated.
+	* generated/matmul_c4.c: Regenerated.
+	* generated/matmul_c8.c: Regenerated.
+	* generated/matmul_i1.c: Regenerated.
+	* generated/matmul_i16.c: Regenerated.
+	* generated/matmul_i2.c: Regenerated.
+	* generated/matmul_i4.c: Regenerated.
+	* generated/matmul_i8.c: Regenerated.
+	* generated/matmul_r10.c: Regenerated.
+	* generated/matmul_r16.c: Regenerated.
+	* generated/matmul_r4.c: Regenerated.
+	* generated/matmul_r8.c: Regenerated.
+
 2016-11-30  Andre Vehreschild  <vehre@gcc.gnu.org>
 
 	* caf/single.c (_gfortran_caf_get_by_ref): Prevent compile time
libgfortran/Makefile.am
@@ -987,7 +987,7 @@ $(i_product_c): m4/product.m4 $(I_M4_DEPS1)
 $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
 	$(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@
 
-$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS)
+$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
 	$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
 
 $(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
libgfortran/Makefile.in
@@ -6053,7 +6053,7 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
 @MAINTAINER_MODE_TRUE@$(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
 @MAINTAINER_MODE_TRUE@	$(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@
 
-@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS)
+@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
 @MAINTAINER_MODE_TRUE@	$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
 
 @MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
libgfortran/acinclude.m4
@@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
 [Define if strerror_r takes two arguments and is available in <string.h>.]),)
   CFLAGS="$ac_save_CFLAGS"
 ])
+
+dnl Check for AVX
+
+AC_DEFUN([LIBGFOR_CHECK_AVX], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  void _mm256_zeroall (void)
+  {
+     __builtin_ia32_vzeroall ();
+  }]], [[]])],
+	AC_DEFINE(HAVE_AVX, 1,
+	[Define if AVX instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX2
+
+AC_DEFUN([LIBGFOR_CHECK_AVX2], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx2"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+  __v4di
+  mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+  {
+     return __builtin_ia32_andnotsi256 (__X, __Y);
+  }]], [[]])],
+	AC_DEFINE(HAVE_AVX2, 1,
+	[Define if AVX2 instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX512f
+
+AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx512f"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  typedef double __m512d __attribute__ ((__vector_size__ (64)));
+  __m512d _mm512_add (__m512d a)
+  {
+    return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+  }]], [[]])],
+	AC_DEFINE(HAVE_AVX512F, 1,
+	[Define if AVX512f instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
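Note that these checks only probe the compiler: each macro tries to build a small AVX, AVX2 or AVX512F function under -mavx, -mavx2 or -mavx512f and defines HAVE_AVX, HAVE_AVX2 or HAVE_AVX512F on success; they say nothing about the machine the library later runs on, which is why matmul.m4 still tests __cpu_model at run time. A hedged sketch (not library code) of how such a HAVE_* macro combines with a per-function target attribute, so that one translation unit can carry an AVX kernel even though libgfortran as a whole is not compiled with -mavx:

/* Sketch only.  HAVE_AVX would come from config.h via the check above;
   without it this file still compiles and only the generic loop remains.  */

#ifdef HAVE_AVX
__attribute__ ((__target__ ("avx"))) static void
scale_avx (double *x, double s, int n)
{
  for (int i = 0; i < n; i++)	/* compiler may emit VEX/AVX code here */
    x[i] *= s;
}
#endif

void
scale (double *x, double s, int n)
{
#ifdef HAVE_AVX
  if (__builtin_cpu_supports ("avx"))	/* compile-time AND run-time gate */
    {
      scale_avx (x, s, n);
      return;
    }
#endif
  for (int i = 0; i < n; i++)
    x[i] *= s;
}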
libgfortran/config.h.in
@@ -78,6 +78,15 @@
 /* Define to 1 if the target supports __attribute__((visibility(...))). */
 #undef HAVE_ATTRIBUTE_VISIBILITY
 
+/* Define if AVX instructions can be compiled. */
+#undef HAVE_AVX
+
+/* Define if AVX2 instructions can be compiled. */
+#undef HAVE_AVX2
+
+/* Define if AVX512f instructions can be compiled. */
+#undef HAVE_AVX512F
+
 /* Define to 1 if you have the `cabs' function. */
 #undef HAVE_CABS
 
87  libgfortran/configure  (vendored)
@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
|
||||
# Check whether we support AVX extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mavx"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
void _mm256_zeroall (void)
|
||||
{
|
||||
__builtin_ia32_vzeroall ();
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_AVX 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check wether we support AVX2 extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mavx2"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
|
||||
__v4di
|
||||
mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
|
||||
{
|
||||
return __builtin_ia32_andnotsi256 (__X, __Y);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_AVX2 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check wether we support AVX512f extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mavx512f"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
typedef double __m512d __attribute__ ((__vector_size__ (64)));
|
||||
__m512d _mm512_add (__m512d a)
|
||||
{
|
||||
return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
# tests run on this system so they can be shared between configure
|
||||
|
libgfortran/configure.ac
@@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE
 # Check whether line terminator is LF or CRLF
 LIBGFOR_CHECK_CRLF
 
+# Check whether we support AVX extensions
+LIBGFOR_CHECK_AVX
+
+# Check whether we support AVX2 extensions
+LIBGFOR_CHECK_AVX2
+
+# Check whether we support AVX512f extensions
+LIBGFOR_CHECK_AVX512F
+
 AC_CACHE_SAVE
 
 if test ${multilib} = yes; then
File diffs suppressed because they are too large: the 13 regenerated files
libgfortran/generated/matmul_{c10,c16,c4,c8,i1,i16,i2,i4,i8,r10,r16,r4,r8}.c.

libgfortran/m4/matmul.m4
@ -76,537 +76,105 @@ extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_'rtype_code`);
|
||||
|
||||
void
|
||||
matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
'ifelse(rtype_letter,`r',dnl
|
||||
`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif')
|
||||
`
|
||||
|
||||
/* Put exhaustive list of possible architectures here here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
|
||||
#ifdef HAVE_AVX
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
|
||||
`static void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
|
||||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
|
||||
`static void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX2 */
|
||||
|
||||
#ifdef HAVE_AVX512F
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
|
||||
`static void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
|
||||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX512F */
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
|
||||
`static' include(matmul_internal.m4)dnl
|
||||
|
||||
`/* Compiling main function, with selection code for the processor. */
|
||||
|
||||
/* Currently, this is i386 only. Adjust for other architectures. */
|
||||
|
||||
#include <config/i386/cpuinfo.h>
|
||||
void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm)
|
||||
{
|
||||
const 'rtype_name` * restrict abase;
|
||||
const 'rtype_name` * restrict bbase;
|
||||
'rtype_name` * restrict dest;
|
||||
static void (*matmul_p) ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) = NULL;
|
||||
|
||||
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
||||
index_type x, y, n, count, xcount, ycount;
|
||||
|
||||
assert (GFC_DESCRIPTOR_RANK (a) == 2
|
||||
|| GFC_DESCRIPTOR_RANK (b) == 2);
|
||||
|
||||
/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
||||
|
||||
Either A or B (but not both) can be rank 1:
|
||||
|
||||
o One-dimensional argument A is implicitly treated as a row matrix
|
||||
dimensioned [1,count], so xcount=1.
|
||||
|
||||
o One-dimensional argument B is implicitly treated as a column matrix
|
||||
dimensioned [count, 1], so ycount=1.
|
||||
*/
|
||||
|
||||
if (retarray->base_addr == NULL)
|
||||
if (matmul_p == NULL)
|
||||
{
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
||||
|
||||
GFC_DIMENSION_SET(retarray->dim[1], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
||||
GFC_DESCRIPTOR_EXTENT(retarray,0));
|
||||
}
|
||||
|
||||
retarray->base_addr
|
||||
= xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
|
||||
retarray->offset = 0;
|
||||
}
|
||||
else if (unlikely (compile_options.bounds_check))
|
||||
{
|
||||
index_type ret_extent, arg_extent;
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
matmul_p = matmul_'rtype_code`_vanilla;
|
||||
if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic: is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic: is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
else
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic for dimension 1:"
|
||||
" is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic for dimension 2:"
|
||||
" is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
}
|
||||
'
|
||||
sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||
`
|
||||
if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
||||
{
|
||||
/* One-dimensional result may be addressed in the code below
|
||||
either as a row or a column matrix. We want both cases to
|
||||
work. */
|
||||
rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
||||
}
|
||||
else
|
||||
{
|
||||
rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
||||
rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
||||
}
|
||||
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
/* Treat it as a a row matrix A[1,count]. */
|
||||
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
||||
aystride = 1;
|
||||
|
||||
xcount = 1;
|
||||
count = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
}
|
||||
else
|
||||
{
|
||||
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
||||
aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
||||
|
||||
count = GFC_DESCRIPTOR_EXTENT(a,1);
|
||||
xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
}
|
||||
|
||||
if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
||||
{
|
||||
if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
||||
runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
|
||||
}
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
/* Treat it as a column matrix B[count,1] */
|
||||
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
||||
|
||||
/* bystride should never be used for 1-dimensional b.
|
||||
in case it is we want it to cause a segfault, rather than
|
||||
an incorrect result. */
|
||||
bystride = 0xDEADBEEF;
|
||||
ycount = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
||||
bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
||||
ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
}
|
||||
|
||||
abase = a->base_addr;
|
||||
bbase = b->base_addr;
|
||||
dest = retarray->base_addr;
|
||||
|
||||
/* Now that everything is set up, we perform the multiplication
|
||||
itself. */
|
||||
|
||||
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||
#define min(a,b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a,b) ((a) >= (b) ? (a) : (b))
|
||||
|
||||
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||
&& (bxstride == 1 || bystride == 1)
|
||||
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||
> POW3(blas_limit)))
|
||||
{
|
||||
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||
const 'rtype_name` one = 1, zero = 0;
|
||||
const int lda = (axstride == 1) ? aystride : axstride,
|
||||
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||
|
||||
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||
{
|
||||
assert (gemm != NULL);
|
||||
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
|
||||
&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
||||
&ldc, 1, 1);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||
{
|
||||
/* This block of code implements a tuned matmul, derived from
|
||||
Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
||||
|
||||
Bo Kagstrom and Per Ling
|
||||
Department of Computing Science
|
||||
Umea University
|
||||
S-901 87 Umea, Sweden
|
||||
|
||||
from netlib.org, translated to C, and modified for matmul.m4. */
|
||||
|
||||
const 'rtype_name` *a, *b;
|
||||
'rtype_name` *c;
|
||||
const index_type m = xcount, n = ycount, k = count;
|
||||
|
||||
/* System generated locals */
|
||||
index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
||||
i1, i2, i3, i4, i5, i6;
|
||||
|
||||
/* Local variables */
|
||||
'rtype_name` t1[65536], /* was [256][256] */
|
||||
f11, f12, f21, f22, f31, f32, f41, f42,
|
||||
f13, f14, f23, f24, f33, f34, f43, f44;
|
||||
index_type i, j, l, ii, jj, ll;
|
||||
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
||||
|
||||
a = abase;
|
||||
b = bbase;
|
||||
c = retarray->base_addr;
|
||||
|
||||
/* Parameter adjustments */
|
||||
c_dim1 = rystride;
|
||||
c_offset = 1 + c_dim1;
|
||||
c -= c_offset;
|
||||
a_dim1 = aystride;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
b_dim1 = bystride;
|
||||
b_offset = 1 + b_dim1;
|
||||
b -= b_offset;
|
||||
|
||||
/* Early exit if possible */
|
||||
if (m == 0 || n == 0 || k == 0)
|
||||
return;
|
||||
|
||||
/* Empty c first. */
|
||||
for (j=1; j<=n; j++)
|
||||
for (i=1; i<=m; i++)
|
||||
c[i + j * c_dim1] = ('rtype_name`)0;
|
||||
|
||||
/* Start turning the crank. */
|
||||
i1 = n;
|
||||
for (jj = 1; jj <= i1; jj += 512)
|
||||
{
|
||||
/* Computing MIN */
|
||||
i2 = 512;
|
||||
i3 = n - jj + 1;
|
||||
jsec = min(i2,i3);
|
||||
ujsec = jsec - jsec % 4;
|
||||
i2 = k;
|
||||
for (ll = 1; ll <= i2; ll += 256)
|
||||
/* Run down the available processors in order of preference. */
|
||||
#ifdef HAVE_AVX512F
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
|
||||
{
|
||||
/* Computing MIN */
|
||||
i3 = 256;
|
||||
i4 = k - ll + 1;
|
||||
lsec = min(i3,i4);
|
||||
ulsec = lsec - lsec % 2;
|
||||
|
||||
i3 = m;
|
||||
for (ii = 1; ii <= i3; ii += 256)
|
||||
{
|
||||
/* Computing MIN */
|
||||
i4 = 256;
|
||||
i5 = m - ii + 1;
|
||||
isec = min(i4,i5);
|
||||
uisec = isec - isec % 2;
|
||||
i4 = ll + ulsec - 1;
|
||||
for (l = ll; l <= i4; l += 2)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 2)
|
||||
{
|
||||
t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + l * a_dim1];
|
||||
t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + (l + 1) * a_dim1];
|
||||
t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
||||
a[i + 1 + l * a_dim1];
|
||||
t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
||||
a[i + 1 + (l + 1) * a_dim1];
|
||||
}
|
||||
if (uisec < isec)
|
||||
{
|
||||
t1[l - ll + 1 + (isec << 8) - 257] =
|
||||
a[ii + isec - 1 + l * a_dim1];
|
||||
t1[l - ll + 2 + (isec << 8) - 257] =
|
||||
a[ii + isec - 1 + (l + 1) * a_dim1];
|
||||
}
|
||||
}
|
||||
if (ulsec < lsec)
|
||||
{
|
||||
i4 = ii + isec - 1;
|
||||
for (i = ii; i<= i4; ++i)
|
||||
{
|
||||
t1[lsec + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + (ll + lsec - 1) * a_dim1];
|
||||
}
|
||||
}
|
||||
|
||||
uisec = isec - isec % 4;
|
||||
i4 = jj + ujsec - 1;
|
||||
for (j = jj; j <= i4; j += 4)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 4)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f21 = c[i + 1 + j * c_dim1];
|
||||
f12 = c[i + (j + 1) * c_dim1];
|
||||
f22 = c[i + 1 + (j + 1) * c_dim1];
|
||||
f13 = c[i + (j + 2) * c_dim1];
|
||||
f23 = c[i + 1 + (j + 2) * c_dim1];
|
||||
f14 = c[i + (j + 3) * c_dim1];
|
||||
f24 = c[i + 1 + (j + 3) * c_dim1];
|
||||
f31 = c[i + 2 + j * c_dim1];
|
||||
f41 = c[i + 3 + j * c_dim1];
|
||||
f32 = c[i + 2 + (j + 1) * c_dim1];
|
||||
f42 = c[i + 3 + (j + 1) * c_dim1];
|
||||
f33 = c[i + 2 + (j + 2) * c_dim1];
|
||||
f43 = c[i + 3 + (j + 2) * c_dim1];
|
||||
f34 = c[i + 2 + (j + 3) * c_dim1];
|
||||
f44 = c[i + 3 + (j + 3) * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + 1 + j * c_dim1] = f21;
|
||||
c[i + (j + 1) * c_dim1] = f12;
|
||||
c[i + 1 + (j + 1) * c_dim1] = f22;
|
||||
c[i + (j + 2) * c_dim1] = f13;
|
||||
c[i + 1 + (j + 2) * c_dim1] = f23;
|
||||
c[i + (j + 3) * c_dim1] = f14;
|
||||
c[i + 1 + (j + 3) * c_dim1] = f24;
|
||||
c[i + 2 + j * c_dim1] = f31;
|
||||
c[i + 3 + j * c_dim1] = f41;
|
||||
c[i + 2 + (j + 1) * c_dim1] = f32;
|
||||
c[i + 3 + (j + 1) * c_dim1] = f42;
|
||||
c[i + 2 + (j + 2) * c_dim1] = f33;
|
||||
c[i + 3 + (j + 2) * c_dim1] = f43;
|
||||
c[i + 2 + (j + 3) * c_dim1] = f34;
|
||||
c[i + 3 + (j + 3) * c_dim1] = f44;
|
||||
}
|
||||
if (uisec < isec)
|
||||
{
|
||||
i5 = ii + isec - 1;
|
||||
for (i = ii + uisec; i <= i5; ++i)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f12 = c[i + (j + 1) * c_dim1];
|
||||
f13 = c[i + (j + 2) * c_dim1];
|
||||
f14 = c[i + (j + 3) * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 1) * b_dim1];
|
||||
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 2) * b_dim1];
|
||||
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 3) * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + (j + 1) * c_dim1] = f12;
|
||||
c[i + (j + 2) * c_dim1] = f13;
|
||||
c[i + (j + 3) * c_dim1] = f14;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ujsec < jsec)
|
||||
{
|
||||
i4 = jj + jsec - 1;
|
||||
for (j = jj + ujsec; j <= i4; ++j)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 4)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f21 = c[i + 1 + j * c_dim1];
|
||||
f31 = c[i + 2 + j * c_dim1];
|
||||
f41 = c[i + 3 + j * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + 1 + j * c_dim1] = f21;
|
||||
c[i + 2 + j * c_dim1] = f31;
|
||||
c[i + 3 + j * c_dim1] = f41;
|
||||
}
|
||||
i5 = ii + isec - 1;
|
||||
for (i = ii + uisec; i <= i5; ++i)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
matmul_p = matmul_'rtype_code`_avx512f;
|
||||
goto tailcall;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
if (GFC_DESCRIPTOR_RANK (a) != 1)
|
||||
{
|
||||
const 'rtype_name` *restrict abase_x;
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` *restrict dest_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
matmul_p = matmul_'rtype_code`_avx2;
|
||||
goto tailcall;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase[n*axstride] * bbase_y[n];
|
||||
dest[y*rystride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
dest[x*rxstride + y*rystride] = ('rtype_name`)0;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (n = 0; n < count; n++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] +=
|
||||
abase[x*axstride + n*aystride] *
|
||||
bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase[n*axstride] * bbase_y[n*bxstride];
|
||||
dest[y*rxstride] = s;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const 'rtype_name` *restrict abase_x;
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` *restrict dest_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_AVX
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
{
|
||||
matmul_p = matmul_'rtype_code`_avx;
|
||||
goto tailcall;
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
}
|
||||
|
||||
tailcall:
|
||||
(*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
|
||||
}
|
||||
|
||||
#else /* Just the vanilla function. */
|
||||
|
||||
'define(`matmul_name',`matmul_'rtype_code)dnl
|
||||
define(`target_attribute',`')dnl
|
||||
include(matmul_internal.m4)dnl
|
||||
`#endif
|
||||
#endif
|
||||
'
|
||||
|
537  libgfortran/m4/matmul_internal.m4  (new file)
@ -0,0 +1,537 @@
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm)
|
||||
{
|
||||
const 'rtype_name` * restrict abase;
|
||||
const 'rtype_name` * restrict bbase;
|
||||
'rtype_name` * restrict dest;
|
||||
|
||||
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
|
||||
index_type x, y, n, count, xcount, ycount;
|
||||
|
||||
assert (GFC_DESCRIPTOR_RANK (a) == 2
|
||||
|| GFC_DESCRIPTOR_RANK (b) == 2);
|
||||
|
||||
/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
|
||||
|
||||
Either A or B (but not both) can be rank 1:
|
||||
|
||||
o One-dimensional argument A is implicitly treated as a row matrix
|
||||
dimensioned [1,count], so xcount=1.
|
||||
|
||||
o One-dimensional argument B is implicitly treated as a column matrix
|
||||
dimensioned [count, 1], so ycount=1.
|
||||
*/
|
||||
|
||||
if (retarray->base_addr == NULL)
|
||||
{
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
GFC_DIMENSION_SET(retarray->dim[0], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
|
||||
|
||||
GFC_DIMENSION_SET(retarray->dim[1], 0,
|
||||
GFC_DESCRIPTOR_EXTENT(b,1) - 1,
|
||||
GFC_DESCRIPTOR_EXTENT(retarray,0));
|
||||
}
|
||||
|
||||
retarray->base_addr
|
||||
= xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
|
||||
retarray->offset = 0;
|
||||
}
|
||||
else if (unlikely (compile_options.bounds_check))
|
||||
{
|
||||
index_type ret_extent, arg_extent;
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic: is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic: is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
else
|
||||
{
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic for dimension 1:"
|
||||
" is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
|
||||
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
|
||||
if (arg_extent != ret_extent)
|
||||
runtime_error ("Incorrect extent in return array in"
|
||||
" MATMUL intrinsic for dimension 2:"
|
||||
" is %ld, should be %ld",
|
||||
(long int) ret_extent, (long int) arg_extent);
|
||||
}
|
||||
}
|
||||
'
|
||||
sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||
`
|
||||
if (GFC_DESCRIPTOR_RANK (retarray) == 1)
|
||||
{
|
||||
/* One-dimensional result may be addressed in the code below
|
||||
either as a row or a column matrix. We want both cases to
|
||||
work. */
|
||||
rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
||||
}
|
||||
else
|
||||
{
|
||||
rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
|
||||
rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
|
||||
}
|
||||
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
/* Treat it as a a row matrix A[1,count]. */
|
||||
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
||||
aystride = 1;
|
||||
|
||||
xcount = 1;
|
||||
count = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
}
|
||||
else
|
||||
{
|
||||
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
|
||||
aystride = GFC_DESCRIPTOR_STRIDE(a,1);
|
||||
|
||||
count = GFC_DESCRIPTOR_EXTENT(a,1);
|
||||
xcount = GFC_DESCRIPTOR_EXTENT(a,0);
|
||||
}
|
||||
|
||||
if (count != GFC_DESCRIPTOR_EXTENT(b,0))
|
||||
{
|
||||
if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
|
||||
runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
|
||||
}
|
||||
|
||||
if (GFC_DESCRIPTOR_RANK (b) == 1)
|
||||
{
|
||||
/* Treat it as a column matrix B[count,1] */
|
||||
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
||||
|
||||
/* bystride should never be used for 1-dimensional b.
|
||||
in case it is we want it to cause a segfault, rather than
|
||||
an incorrect result. */
|
||||
bystride = 0xDEADBEEF;
|
||||
ycount = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
|
||||
bystride = GFC_DESCRIPTOR_STRIDE(b,1);
|
||||
ycount = GFC_DESCRIPTOR_EXTENT(b,1);
|
||||
}
|
||||
|
||||
abase = a->base_addr;
|
||||
bbase = b->base_addr;
|
||||
dest = retarray->base_addr;
|
||||
|
||||
/* Now that everything is set up, we perform the multiplication
|
||||
itself. */
|
||||
|
||||
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||
#define min(a,b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a,b) ((a) >= (b) ? (a) : (b))
|
||||
|
||||
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||
&& (bxstride == 1 || bystride == 1)
|
||||
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||
> POW3(blas_limit)))
|
||||
{
|
||||
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||
const 'rtype_name` one = 1, zero = 0;
|
||||
const int lda = (axstride == 1) ? aystride : axstride,
|
||||
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||
|
||||
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||
{
|
||||
assert (gemm != NULL);
|
||||
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
|
||||
&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
|
||||
&ldc, 1, 1);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||
{
|
||||
/* This block of code implements a tuned matmul, derived from
|
||||
Superscalar GEMM-based level 3 BLAS, Beta version 0.1
|
||||
|
||||
Bo Kagstrom and Per Ling
|
||||
Department of Computing Science
|
||||
Umea University
|
||||
S-901 87 Umea, Sweden
|
||||
|
||||
from netlib.org, translated to C, and modified for matmul.m4. */
|
||||
|
||||
const 'rtype_name` *a, *b;
|
||||
'rtype_name` *c;
|
||||
const index_type m = xcount, n = ycount, k = count;
|
||||
|
||||
/* System generated locals */
|
||||
index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
|
||||
i1, i2, i3, i4, i5, i6;
|
||||
|
||||
/* Local variables */
|
||||
'rtype_name` t1[65536], /* was [256][256] */
|
||||
f11, f12, f21, f22, f31, f32, f41, f42,
|
||||
f13, f14, f23, f24, f33, f34, f43, f44;
|
||||
index_type i, j, l, ii, jj, ll;
|
||||
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
|
||||
|
||||
a = abase;
|
||||
b = bbase;
|
||||
c = retarray->base_addr;
|
||||
|
||||
/* Parameter adjustments */
|
||||
c_dim1 = rystride;
|
||||
c_offset = 1 + c_dim1;
|
||||
c -= c_offset;
|
||||
a_dim1 = aystride;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
b_dim1 = bystride;
|
||||
b_offset = 1 + b_dim1;
|
||||
b -= b_offset;
|
||||
|
||||
/* Early exit if possible */
|
||||
if (m == 0 || n == 0 || k == 0)
|
||||
return;
|
||||
|
||||
/* Empty c first. */
|
||||
for (j=1; j<=n; j++)
|
||||
for (i=1; i<=m; i++)
|
||||
c[i + j * c_dim1] = ('rtype_name`)0;
|
||||
|
||||
/* Start turning the crank. */
|
||||
i1 = n;
|
||||
for (jj = 1; jj <= i1; jj += 512)
|
||||
{
|
||||
/* Computing MIN */
|
||||
i2 = 512;
|
||||
i3 = n - jj + 1;
|
||||
jsec = min(i2,i3);
|
||||
ujsec = jsec - jsec % 4;
|
||||
i2 = k;
|
||||
for (ll = 1; ll <= i2; ll += 256)
|
||||
{
|
||||
/* Computing MIN */
|
||||
i3 = 256;
|
||||
i4 = k - ll + 1;
|
||||
lsec = min(i3,i4);
|
||||
ulsec = lsec - lsec % 2;
|
||||
|
||||
i3 = m;
|
||||
for (ii = 1; ii <= i3; ii += 256)
|
||||
{
|
||||
/* Computing MIN */
|
||||
i4 = 256;
|
||||
i5 = m - ii + 1;
|
||||
isec = min(i4,i5);
|
||||
uisec = isec - isec % 2;
|
||||
i4 = ll + ulsec - 1;
|
||||
for (l = ll; l <= i4; l += 2)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 2)
|
||||
{
|
||||
t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + l * a_dim1];
|
||||
t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + (l + 1) * a_dim1];
|
||||
t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
|
||||
a[i + 1 + l * a_dim1];
|
||||
t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
|
||||
a[i + 1 + (l + 1) * a_dim1];
|
||||
}
|
||||
if (uisec < isec)
|
||||
{
|
||||
t1[l - ll + 1 + (isec << 8) - 257] =
|
||||
a[ii + isec - 1 + l * a_dim1];
|
||||
t1[l - ll + 2 + (isec << 8) - 257] =
|
||||
a[ii + isec - 1 + (l + 1) * a_dim1];
|
||||
}
|
||||
}
|
||||
if (ulsec < lsec)
|
||||
{
|
||||
i4 = ii + isec - 1;
|
||||
for (i = ii; i<= i4; ++i)
|
||||
{
|
||||
t1[lsec + ((i - ii + 1) << 8) - 257] =
|
||||
a[i + (ll + lsec - 1) * a_dim1];
|
||||
}
|
||||
}
|
||||
|
||||
uisec = isec - isec % 4;
|
||||
i4 = jj + ujsec - 1;
|
||||
for (j = jj; j <= i4; j += 4)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 4)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f21 = c[i + 1 + j * c_dim1];
|
||||
f12 = c[i + (j + 1) * c_dim1];
|
||||
f22 = c[i + 1 + (j + 1) * c_dim1];
|
||||
f13 = c[i + (j + 2) * c_dim1];
|
||||
f23 = c[i + 1 + (j + 2) * c_dim1];
|
||||
f14 = c[i + (j + 3) * c_dim1];
|
||||
f24 = c[i + 1 + (j + 3) * c_dim1];
|
||||
f31 = c[i + 2 + j * c_dim1];
|
||||
f41 = c[i + 3 + j * c_dim1];
|
||||
f32 = c[i + 2 + (j + 1) * c_dim1];
|
||||
f42 = c[i + 3 + (j + 1) * c_dim1];
|
||||
f33 = c[i + 2 + (j + 2) * c_dim1];
|
||||
f43 = c[i + 3 + (j + 2) * c_dim1];
|
||||
f34 = c[i + 2 + (j + 3) * c_dim1];
|
||||
f44 = c[i + 3 + (j + 3) * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + j * b_dim1];
|
||||
f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 1) * b_dim1];
|
||||
f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 2) * b_dim1];
|
||||
f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
|
||||
* b[l + (j + 3) * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + 1 + j * c_dim1] = f21;
|
||||
c[i + (j + 1) * c_dim1] = f12;
|
||||
c[i + 1 + (j + 1) * c_dim1] = f22;
|
||||
c[i + (j + 2) * c_dim1] = f13;
|
||||
c[i + 1 + (j + 2) * c_dim1] = f23;
|
||||
c[i + (j + 3) * c_dim1] = f14;
|
||||
c[i + 1 + (j + 3) * c_dim1] = f24;
|
||||
c[i + 2 + j * c_dim1] = f31;
|
||||
c[i + 3 + j * c_dim1] = f41;
|
||||
c[i + 2 + (j + 1) * c_dim1] = f32;
|
||||
c[i + 3 + (j + 1) * c_dim1] = f42;
|
||||
c[i + 2 + (j + 2) * c_dim1] = f33;
|
||||
c[i + 3 + (j + 2) * c_dim1] = f43;
|
||||
c[i + 2 + (j + 3) * c_dim1] = f34;
|
||||
c[i + 3 + (j + 3) * c_dim1] = f44;
|
||||
}
|
||||
if (uisec < isec)
|
||||
{
|
||||
i5 = ii + isec - 1;
|
||||
for (i = ii + uisec; i <= i5; ++i)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f12 = c[i + (j + 1) * c_dim1];
|
||||
f13 = c[i + (j + 2) * c_dim1];
|
||||
f14 = c[i + (j + 3) * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 1) * b_dim1];
|
||||
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 2) * b_dim1];
|
||||
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + (j + 3) * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + (j + 1) * c_dim1] = f12;
|
||||
c[i + (j + 2) * c_dim1] = f13;
|
||||
c[i + (j + 3) * c_dim1] = f14;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ujsec < jsec)
|
||||
{
|
||||
i4 = jj + jsec - 1;
|
||||
for (j = jj + ujsec; j <= i4; ++j)
|
||||
{
|
||||
i5 = ii + uisec - 1;
|
||||
for (i = ii; i <= i5; i += 4)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
f21 = c[i + 1 + j * c_dim1];
|
||||
f31 = c[i + 2 + j * c_dim1];
|
||||
f41 = c[i + 3 + j * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
c[i + 1 + j * c_dim1] = f21;
|
||||
c[i + 2 + j * c_dim1] = f31;
|
||||
c[i + 3 + j * c_dim1] = f41;
|
||||
}
|
||||
i5 = ii + isec - 1;
|
||||
for (i = ii + uisec; i <= i5; ++i)
|
||||
{
|
||||
f11 = c[i + j * c_dim1];
|
||||
i6 = ll + lsec - 1;
|
||||
for (l = ll; l <= i6; ++l)
|
||||
{
|
||||
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
|
||||
257] * b[l + j * b_dim1];
|
||||
}
|
||||
c[i + j * c_dim1] = f11;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
if (GFC_DESCRIPTOR_RANK (a) != 1)
|
||||
{
|
||||
const 'rtype_name` *restrict abase_x;
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` *restrict dest_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase[n*axstride] * bbase_y[n];
|
||||
dest[y*rystride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
dest[x*rxstride + y*rystride] = ('rtype_name`)0;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (n = 0; n < count; n++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] +=
|
||||
abase[x*axstride + n*aystride] *
|
||||
bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else if (GFC_DESCRIPTOR_RANK (a) == 1)
|
||||
{
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase[n*axstride] * bbase_y[n*bxstride];
|
||||
dest[y*rxstride] = s;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const 'rtype_name` *restrict abase_x;
|
||||
const 'rtype_name` *restrict bbase_y;
|
||||
'rtype_name` *restrict dest_y;
|
||||
'rtype_name` s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = ('rtype_name`) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef POW3
|
||||
#undef min
|
||||
#undef max
|
||||
'
|
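For reference, the speed of the kernel that matmul_internal.m4 expands (the Kagstrom/Ling scheme above) comes from blocking rather than from intrinsics: it copies up to a 256x256 panel of A into the scratch buffer t1 and then updates C in 4x4 register tiles, so the panel is reused from cache while B is streamed. A much-reduced sketch of the same idea in plain C, with one block size and no register tiling, just to show the loop structure (not the library's code; row-major square matrices for simplicity):

/* Sketch of the cache-blocking idea: process the inner dimension and the
   rows of A in blocks, copy each A block into a contiguous scratch buffer
   (like t1[] above), and reuse it for every column of B.  C = A * B.  */
#define BS 256

static double t1[BS * BS];	/* scratch copy of one A block */

static int
imin (int a, int b)
{
  return a < b ? a : b;
}

void
matmul_blocked (double *c, const double *a, const double *b, int n)
{
  for (int i = 0; i < n * n; i++)	/* "Empty c first", as the kernel does */
    c[i] = 0.0;

  for (int kk = 0; kk < n; kk += BS)	/* panel of the inner dimension */
    {
      int kmax = imin (kk + BS, n);
      for (int ii = 0; ii < n; ii += BS)	/* block of rows of A and C */
	{
	  int imax = imin (ii + BS, n);

	  /* Copy the A block once; it stays hot while we sweep all of B.  */
	  for (int i = ii; i < imax; i++)
	    for (int k = kk; k < kmax; k++)
	      t1[(i - ii) * BS + (k - kk)] = a[i * n + k];

	  for (int j = 0; j < n; j++)
	    for (int i = ii; i < imax; i++)
	      {
		double s = 0.0;
		for (int k = kk; k < kmax; k++)
		  s += t1[(i - ii) * BS + (k - kk)] * b[k * n + j];
		c[i * n + j] += s;
	      }
	}
    }
}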