mirror of
git://gcc.gnu.org/git/gcc.git
synced 2024-12-11 20:41:01 +08:00
Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h.
gcc/fortran/ * Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h. * gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag. * dependency.h (gfc_get_noncopying_intrinsic_argument): Declare. (gfc_check_fncall_dependency): Change prototype. * dependency.c (gfc_get_noncopying_intrinsic_argument): New function. (gfc_check_argument_var_dependency): New function, split from gfc_check_fncall_dependency. (gfc_check_argument_dependency): New function. (gfc_check_fncall_dependency): Replace the expression parameter with separate symbol and argument list parameters. Generalize the function to handle dependencies for any type of expression, not just variables. Accept a further argument giving the intent of the expression being tested. Ignore intent(in) arguments if that expression is also intent(in). * resolve.c: Include dependency.h. (find_noncopying_intrinsics): New function. (resolve_function, resolve_call): Call it on success. * trans-array.h (gfc_conv_array_transpose): Declare. (gfc_check_fncall_dependency): Remove prototype. * trans-array.c (gfc_conv_array_transpose): New function. * trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the libcall handling if the expression is to be evaluated inline. Add a case for handling inline transpose()s. * trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new interface provided by gfc_check_fncall_dependency. libgfortran/ * m4/matmul.m4: Use a different order in the special case of a transposed first argument. * generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c, * generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c, * generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c * generated/matmul_r10.c, generated/matmul_r16.c: Regenerated. Co-Authored-By: Victor Leikehman <LEI@il.ibm.com> From-SVN: r108459
This commit is contained in:
parent
264c41eda5
commit
1524f80b1c
@ -1,3 +1,32 @@
|
||||
2005-12-13 Richard Sandiford <richard@codesourcery.com>
|
||||
|
||||
* Make-lang.in (fortran/trans-resolve.o): Depend on
|
||||
fortran/dependency.h.
|
||||
* gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
|
||||
* dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
|
||||
(gfc_check_fncall_dependency): Change prototype.
|
||||
* dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
|
||||
(gfc_check_argument_var_dependency): New function, split from
|
||||
gfc_check_fncall_dependency.
|
||||
(gfc_check_argument_dependency): New function.
|
||||
(gfc_check_fncall_dependency): Replace the expression parameter with
|
||||
separate symbol and argument list parameters. Generalize the function
|
||||
to handle dependencies for any type of expression, not just variables.
|
||||
Accept a further argument giving the intent of the expression being
|
||||
tested. Ignore intent(in) arguments if that expression is also
|
||||
intent(in).
|
||||
* resolve.c: Include dependency.h.
|
||||
(find_noncopying_intrinsics): New function.
|
||||
(resolve_function, resolve_call): Call it on success.
|
||||
* trans-array.h (gfc_conv_array_transpose): Declare.
|
||||
(gfc_check_fncall_dependency): Remove prototype.
|
||||
* trans-array.c (gfc_conv_array_transpose): New function.
|
||||
* trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
|
||||
libcall handling if the expression is to be evaluated inline.
|
||||
Add a case for handling inline transpose()s.
|
||||
* trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
|
||||
interface provided by gfc_check_fncall_dependency.
|
||||
|
||||
2005-12-12 Steven G. Kargl <kargls@comcast.net>
|
||||
|
||||
PR fortran/25078
|
||||
|
@ -286,4 +286,5 @@ fortran/trans-intrinsic.o: $(GFORTRAN_TRANS_DEPS) fortran/mathbuiltins.def \
|
||||
gt-fortran-trans-intrinsic.h
|
||||
fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
|
||||
fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
|
||||
fortran/resolve.o: fortran/dependency.h
|
||||
|
||||
|
@ -175,6 +175,32 @@ gfc_is_same_range (gfc_array_ref * ar1, gfc_array_ref * ar2, int n, int def)
|
||||
}
|
||||
|
||||
|
||||
/* Some array-returning intrinsics can be implemented by reusing the
|
||||
data from one of the array arguments. For example, TRANPOSE does
|
||||
not necessarily need to allocate new data: it can be implemented
|
||||
by copying the original array's descriptor and simply swapping the
|
||||
two dimension specifications.
|
||||
|
||||
If EXPR is a call to such an intrinsic, return the argument
|
||||
whose data can be reused, otherwise return NULL. */
|
||||
|
||||
gfc_expr *
|
||||
gfc_get_noncopying_intrinsic_argument (gfc_expr * expr)
|
||||
{
|
||||
if (expr->expr_type != EXPR_FUNCTION || !expr->value.function.isym)
|
||||
return NULL;
|
||||
|
||||
switch (expr->value.function.isym->generic_id)
|
||||
{
|
||||
case GFC_ISYM_TRANSPOSE:
|
||||
return expr->value.function.actual->expr;
|
||||
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Return true if the result of reference REF can only be constructed
|
||||
using a temporary array. */
|
||||
|
||||
@ -214,23 +240,82 @@ gfc_ref_needs_temporary_p (gfc_ref *ref)
|
||||
}
|
||||
|
||||
|
||||
/* Dependency checking for direct function return by reference.
|
||||
Returns true if the arguments of the function depend on the
|
||||
destination. This is considerably less conservative than other
|
||||
dependencies because many function arguments will already be
|
||||
copied into a temporary. */
|
||||
/* Return true if array variable VAR could be passed to the same function
|
||||
as argument EXPR without interfering with EXPR. INTENT is the intent
|
||||
of VAR.
|
||||
|
||||
This is considerably less conservative than other dependencies
|
||||
because many function arguments will already be copied into a
|
||||
temporary. */
|
||||
|
||||
static int
|
||||
gfc_check_argument_var_dependency (gfc_expr * var, sym_intent intent,
|
||||
gfc_expr * expr)
|
||||
{
|
||||
gcc_assert (var->expr_type == EXPR_VARIABLE);
|
||||
gcc_assert (var->rank > 0);
|
||||
|
||||
switch (expr->expr_type)
|
||||
{
|
||||
case EXPR_VARIABLE:
|
||||
return (gfc_ref_needs_temporary_p (expr->ref)
|
||||
|| gfc_check_dependency (var, expr, NULL, 0));
|
||||
|
||||
case EXPR_ARRAY:
|
||||
return gfc_check_dependency (var, expr, NULL, 0);
|
||||
|
||||
case EXPR_FUNCTION:
|
||||
if (intent != INTENT_IN && expr->inline_noncopying_intrinsic)
|
||||
{
|
||||
expr = gfc_get_noncopying_intrinsic_argument (expr);
|
||||
return gfc_check_argument_var_dependency (var, intent, expr);
|
||||
}
|
||||
return 0;
|
||||
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Like gfc_check_argument_var_dependency, but extended to any
|
||||
array expression OTHER, not just variables. */
|
||||
|
||||
static int
|
||||
gfc_check_argument_dependency (gfc_expr * other, sym_intent intent,
|
||||
gfc_expr * expr)
|
||||
{
|
||||
switch (other->expr_type)
|
||||
{
|
||||
case EXPR_VARIABLE:
|
||||
return gfc_check_argument_var_dependency (other, intent, expr);
|
||||
|
||||
case EXPR_FUNCTION:
|
||||
if (other->inline_noncopying_intrinsic)
|
||||
{
|
||||
other = gfc_get_noncopying_intrinsic_argument (other);
|
||||
return gfc_check_argument_dependency (other, INTENT_IN, expr);
|
||||
}
|
||||
return 0;
|
||||
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Like gfc_check_argument_dependency, but check all the arguments in ACTUAL.
|
||||
FNSYM is the function being called, or NULL if not known. */
|
||||
|
||||
int
|
||||
gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
|
||||
gfc_check_fncall_dependency (gfc_expr * other, sym_intent intent,
|
||||
gfc_symbol * fnsym, gfc_actual_arglist * actual)
|
||||
{
|
||||
gfc_actual_arglist *actual;
|
||||
gfc_formal_arglist *formal;
|
||||
gfc_expr *expr;
|
||||
|
||||
gcc_assert (dest->expr_type == EXPR_VARIABLE
|
||||
&& fncall->expr_type == EXPR_FUNCTION);
|
||||
gcc_assert (fncall->rank > 0);
|
||||
|
||||
for (actual = fncall->value.function.actual; actual; actual = actual->next)
|
||||
formal = fnsym ? fnsym->formal : NULL;
|
||||
for (; actual; actual = actual->next, formal = formal ? formal->next : NULL)
|
||||
{
|
||||
expr = actual->expr;
|
||||
|
||||
@ -238,23 +323,14 @@ gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
|
||||
if (!expr)
|
||||
continue;
|
||||
|
||||
/* Non-variable expressions will be allocated temporaries anyway. */
|
||||
switch (expr->expr_type)
|
||||
{
|
||||
case EXPR_VARIABLE:
|
||||
if (!gfc_ref_needs_temporary_p (expr->ref)
|
||||
&& gfc_check_dependency (dest, expr, NULL, 0))
|
||||
return 1;
|
||||
break;
|
||||
/* Skip intent(in) arguments if OTHER itself is intent(in). */
|
||||
if (formal
|
||||
&& intent == INTENT_IN
|
||||
&& formal->sym->attr.intent == INTENT_IN)
|
||||
continue;
|
||||
|
||||
case EXPR_ARRAY:
|
||||
if (gfc_check_dependency (dest, expr, NULL, 0))
|
||||
return 1;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (gfc_check_argument_dependency (other, intent, expr))
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -22,7 +22,9 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
|
||||
|
||||
bool gfc_ref_needs_temporary_p (gfc_ref *);
|
||||
int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
|
||||
gfc_expr *gfc_get_noncopying_intrinsic_argument (gfc_expr *);
|
||||
int gfc_check_fncall_dependency (gfc_expr *, sym_intent, gfc_symbol *,
|
||||
gfc_actual_arglist *);
|
||||
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
||||
int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
|
||||
int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);
|
||||
|
@ -1129,6 +1129,9 @@ typedef struct gfc_expr
|
||||
|
||||
/* True if it is converted from Hollerith constant. */
|
||||
unsigned int from_H : 1;
|
||||
/* True if the expression is a call to a function that returns an array,
|
||||
and if we have decided not to allocate temporary data for that array. */
|
||||
unsigned int inline_noncopying_intrinsic : 1;
|
||||
|
||||
union
|
||||
{
|
||||
|
@ -24,6 +24,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor,Boston, MA
|
||||
#include "system.h"
|
||||
#include "gfortran.h"
|
||||
#include "arith.h" /* For gfc_compare_expr(). */
|
||||
#include "dependency.h"
|
||||
|
||||
/* Types used in equivalence statements. */
|
||||
|
||||
@ -804,6 +805,24 @@ resolve_actual_arglist (gfc_actual_arglist * arg)
|
||||
}
|
||||
|
||||
|
||||
/* Go through each actual argument in ACTUAL and see if it can be
|
||||
implemented as an inlined, non-copying intrinsic. FNSYM is the
|
||||
function being called, or NULL if not known. */
|
||||
|
||||
static void
|
||||
find_noncopying_intrinsics (gfc_symbol * fnsym, gfc_actual_arglist * actual)
|
||||
{
|
||||
gfc_actual_arglist *ap;
|
||||
gfc_expr *expr;
|
||||
|
||||
for (ap = actual; ap; ap = ap->next)
|
||||
if (ap->expr
|
||||
&& (expr = gfc_get_noncopying_intrinsic_argument (ap->expr))
|
||||
&& !gfc_check_fncall_dependency (expr, INTENT_IN, fnsym, actual))
|
||||
ap->expr->inline_noncopying_intrinsic = 1;
|
||||
}
|
||||
|
||||
|
||||
/************* Function resolution *************/
|
||||
|
||||
/* Resolve a function call known to be generic.
|
||||
@ -1150,6 +1169,9 @@ resolve_function (gfc_expr * expr)
|
||||
}
|
||||
}
|
||||
|
||||
if (t == SUCCESS)
|
||||
find_noncopying_intrinsics (expr->value.function.esym,
|
||||
expr->value.function.actual);
|
||||
return t;
|
||||
}
|
||||
|
||||
@ -1372,27 +1394,28 @@ resolve_call (gfc_code * c)
|
||||
if (resolve_actual_arglist (c->ext.actual) == FAILURE)
|
||||
return FAILURE;
|
||||
|
||||
if (c->resolved_sym != NULL)
|
||||
return SUCCESS;
|
||||
t = SUCCESS;
|
||||
if (c->resolved_sym == NULL)
|
||||
switch (procedure_kind (c->symtree->n.sym))
|
||||
{
|
||||
case PTYPE_GENERIC:
|
||||
t = resolve_generic_s (c);
|
||||
break;
|
||||
|
||||
switch (procedure_kind (c->symtree->n.sym))
|
||||
{
|
||||
case PTYPE_GENERIC:
|
||||
t = resolve_generic_s (c);
|
||||
break;
|
||||
case PTYPE_SPECIFIC:
|
||||
t = resolve_specific_s (c);
|
||||
break;
|
||||
|
||||
case PTYPE_SPECIFIC:
|
||||
t = resolve_specific_s (c);
|
||||
break;
|
||||
case PTYPE_UNKNOWN:
|
||||
t = resolve_unknown_s (c);
|
||||
break;
|
||||
|
||||
case PTYPE_UNKNOWN:
|
||||
t = resolve_unknown_s (c);
|
||||
break;
|
||||
|
||||
default:
|
||||
gfc_internal_error ("resolve_subroutine(): bad function type");
|
||||
}
|
||||
default:
|
||||
gfc_internal_error ("resolve_subroutine(): bad function type");
|
||||
}
|
||||
|
||||
if (t == SUCCESS)
|
||||
find_noncopying_intrinsics (c->resolved_sym, c->ext.actual);
|
||||
return t;
|
||||
}
|
||||
|
||||
|
@ -673,6 +673,95 @@ gfc_trans_allocate_temp_array (stmtblock_t * pre, stmtblock_t * post,
|
||||
}
|
||||
|
||||
|
||||
/* Generate code to tranpose array EXPR by creating a new descriptor
|
||||
in which the dimension specifications have been reversed. */
|
||||
|
||||
void
|
||||
gfc_conv_array_transpose (gfc_se * se, gfc_expr * expr)
|
||||
{
|
||||
tree dest, src, dest_index, src_index;
|
||||
gfc_loopinfo *loop;
|
||||
gfc_ss_info *dest_info, *src_info;
|
||||
gfc_ss *dest_ss, *src_ss;
|
||||
gfc_se src_se;
|
||||
int n;
|
||||
|
||||
loop = se->loop;
|
||||
|
||||
src_ss = gfc_walk_expr (expr);
|
||||
dest_ss = se->ss;
|
||||
|
||||
src_info = &src_ss->data.info;
|
||||
dest_info = &dest_ss->data.info;
|
||||
|
||||
/* Get a descriptor for EXPR. */
|
||||
gfc_init_se (&src_se, NULL);
|
||||
gfc_conv_expr_descriptor (&src_se, expr, src_ss);
|
||||
gfc_add_block_to_block (&se->pre, &src_se.pre);
|
||||
gfc_add_block_to_block (&se->post, &src_se.post);
|
||||
src = src_se.expr;
|
||||
|
||||
/* Allocate a new descriptor for the return value. */
|
||||
dest = gfc_create_var (TREE_TYPE (src), "atmp");
|
||||
dest_info->descriptor = dest;
|
||||
se->expr = dest;
|
||||
|
||||
/* Copy across the dtype field. */
|
||||
gfc_add_modify_expr (&se->pre,
|
||||
gfc_conv_descriptor_dtype (dest),
|
||||
gfc_conv_descriptor_dtype (src));
|
||||
|
||||
/* Copy the dimension information, renumbering dimension 1 to 0 and
|
||||
0 to 1. */
|
||||
gcc_assert (dest_info->dimen == 2);
|
||||
gcc_assert (src_info->dimen == 2);
|
||||
for (n = 0; n < 2; n++)
|
||||
{
|
||||
dest_info->delta[n] = integer_zero_node;
|
||||
dest_info->start[n] = integer_zero_node;
|
||||
dest_info->stride[n] = integer_one_node;
|
||||
dest_info->dim[n] = n;
|
||||
|
||||
dest_index = gfc_rank_cst[n];
|
||||
src_index = gfc_rank_cst[1 - n];
|
||||
|
||||
gfc_add_modify_expr (&se->pre,
|
||||
gfc_conv_descriptor_stride (dest, dest_index),
|
||||
gfc_conv_descriptor_stride (src, src_index));
|
||||
|
||||
gfc_add_modify_expr (&se->pre,
|
||||
gfc_conv_descriptor_lbound (dest, dest_index),
|
||||
gfc_conv_descriptor_lbound (src, src_index));
|
||||
|
||||
gfc_add_modify_expr (&se->pre,
|
||||
gfc_conv_descriptor_ubound (dest, dest_index),
|
||||
gfc_conv_descriptor_ubound (src, src_index));
|
||||
|
||||
if (!loop->to[n])
|
||||
{
|
||||
gcc_assert (integer_zerop (loop->from[n]));
|
||||
loop->to[n] = build2 (MINUS_EXPR, gfc_array_index_type,
|
||||
gfc_conv_descriptor_ubound (dest, dest_index),
|
||||
gfc_conv_descriptor_lbound (dest, dest_index));
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy the data pointer. */
|
||||
dest_info->data = gfc_conv_descriptor_data_get (src);
|
||||
gfc_conv_descriptor_data_set (&se->pre, dest, dest_info->data);
|
||||
|
||||
/* Copy the offset. This is not changed by transposition: the top-left
|
||||
element is still at the same offset as before. */
|
||||
dest_info->offset = gfc_conv_descriptor_offset (src);
|
||||
gfc_add_modify_expr (&se->pre,
|
||||
gfc_conv_descriptor_offset (dest),
|
||||
dest_info->offset);
|
||||
|
||||
if (dest_info->dimen > loop->temp_dim)
|
||||
loop->temp_dim = dest_info->dimen;
|
||||
}
|
||||
|
||||
|
||||
/* Return the number of iterations in a loop that starts at START,
|
||||
ends at END, and has step STEP. */
|
||||
|
||||
|
@ -91,6 +91,8 @@ void gfc_conv_tmp_ref (gfc_se *);
|
||||
void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
|
||||
/* Convert an array for passing as an actual function parameter. */
|
||||
void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
|
||||
/* Evaluate and transpose a matrix expression. */
|
||||
void gfc_conv_array_transpose (gfc_se *, gfc_expr *);
|
||||
|
||||
/* These work with both descriptors and descriptorless arrays. */
|
||||
tree gfc_conv_array_data (tree);
|
||||
@ -112,8 +114,6 @@ tree gfc_conv_descriptor_ubound (tree, tree);
|
||||
|
||||
/* Dependency checking for WHERE and FORALL. */
|
||||
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
||||
/* Dependency checking for function calls. */
|
||||
int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
|
||||
|
||||
/* Add pre-loop scalarization code for intrinsic functions which require
|
||||
special handling. */
|
||||
|
@ -2627,7 +2627,9 @@ gfc_trans_arrayfunc_assign (gfc_expr * expr1, gfc_expr * expr2)
|
||||
}
|
||||
|
||||
/* Check for a dependency. */
|
||||
if (gfc_check_fncall_dependency (expr1, expr2))
|
||||
if (gfc_check_fncall_dependency (expr1, INTENT_OUT,
|
||||
expr2->value.function.esym,
|
||||
expr2->value.function.actual))
|
||||
return NULL;
|
||||
|
||||
/* The frontend doesn't seem to bother filling in expr->symtree for intrinsic
|
||||
|
@ -2894,7 +2894,7 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
|
||||
|
||||
name = &expr->value.function.name[2];
|
||||
|
||||
if (expr->rank > 0)
|
||||
if (expr->rank > 0 && !expr->inline_noncopying_intrinsic)
|
||||
{
|
||||
lib = gfc_is_intrinsic_libcall (expr);
|
||||
if (lib != 0)
|
||||
@ -3119,6 +3119,16 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
|
||||
gfc_conv_intrinsic_bound (se, expr, 0);
|
||||
break;
|
||||
|
||||
case GFC_ISYM_TRANSPOSE:
|
||||
if (se->ss && se->ss->useflags)
|
||||
{
|
||||
gfc_conv_tmp_array_ref (se);
|
||||
gfc_advance_se_ss_chain (se);
|
||||
}
|
||||
else
|
||||
gfc_conv_array_transpose (se, expr->value.function.actual->expr);
|
||||
break;
|
||||
|
||||
case GFC_ISYM_LEN:
|
||||
gfc_conv_intrinsic_len (se, expr);
|
||||
break;
|
||||
|
@ -1,3 +1,13 @@
|
||||
2005-12-13 Richard Sandiford <richard@codesourcery.com>
|
||||
Victor Leikehman <LEI@il.ibm.com>
|
||||
|
||||
* m4/matmul.m4: Use a different order in the special case of a
|
||||
transposed first argument.
|
||||
* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
|
||||
* generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
|
||||
* generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c
|
||||
* generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.
|
||||
|
||||
2005-12-10 Janne Blomqvist <jb@gcc.gnu.org>
|
||||
|
||||
* Makefile.am: Enable loop unrolling for matmul.
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_COMPLEX_10)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_COMPLEX_10 *restrict abase_x;
|
||||
const GFC_COMPLEX_10 *restrict bbase_y;
|
||||
GFC_COMPLEX_10 *restrict dest_y;
|
||||
GFC_COMPLEX_10 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_10) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_COMPLEX_10 *restrict abase_x;
|
||||
const GFC_COMPLEX_10 *restrict bbase_y;
|
||||
GFC_COMPLEX_10 *restrict dest_y;
|
||||
GFC_COMPLEX_10 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_10) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_COMPLEX_16)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_COMPLEX_16 *restrict abase_x;
|
||||
const GFC_COMPLEX_16 *restrict bbase_y;
|
||||
GFC_COMPLEX_16 *restrict dest_y;
|
||||
GFC_COMPLEX_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_COMPLEX_16 *restrict abase_x;
|
||||
const GFC_COMPLEX_16 *restrict bbase_y;
|
||||
GFC_COMPLEX_16 *restrict dest_y;
|
||||
GFC_COMPLEX_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_COMPLEX_4)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_COMPLEX_4 *restrict abase_x;
|
||||
const GFC_COMPLEX_4 *restrict bbase_y;
|
||||
GFC_COMPLEX_4 *restrict dest_y;
|
||||
GFC_COMPLEX_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_COMPLEX_4 *restrict abase_x;
|
||||
const GFC_COMPLEX_4 *restrict bbase_y;
|
||||
GFC_COMPLEX_4 *restrict dest_y;
|
||||
GFC_COMPLEX_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_COMPLEX_8)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_COMPLEX_8 *restrict abase_x;
|
||||
const GFC_COMPLEX_8 *restrict bbase_y;
|
||||
GFC_COMPLEX_8 *restrict dest_y;
|
||||
GFC_COMPLEX_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_COMPLEX_8 *restrict abase_x;
|
||||
const GFC_COMPLEX_8 *restrict bbase_y;
|
||||
GFC_COMPLEX_8 *restrict dest_y;
|
||||
GFC_COMPLEX_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_COMPLEX_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_INTEGER_16)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_INTEGER_16 *restrict abase_x;
|
||||
const GFC_INTEGER_16 *restrict bbase_y;
|
||||
GFC_INTEGER_16 *restrict dest_y;
|
||||
GFC_INTEGER_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_INTEGER_16 *restrict abase_x;
|
||||
const GFC_INTEGER_16 *restrict bbase_y;
|
||||
GFC_INTEGER_16 *restrict dest_y;
|
||||
GFC_INTEGER_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_INTEGER_4)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_INTEGER_4 *restrict abase_x;
|
||||
const GFC_INTEGER_4 *restrict bbase_y;
|
||||
GFC_INTEGER_4 *restrict dest_y;
|
||||
GFC_INTEGER_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_INTEGER_4 *restrict abase_x;
|
||||
const GFC_INTEGER_4 *restrict bbase_y;
|
||||
GFC_INTEGER_4 *restrict dest_y;
|
||||
GFC_INTEGER_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_INTEGER_8)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_INTEGER_8 *restrict abase_x;
|
||||
const GFC_INTEGER_8 *restrict bbase_y;
|
||||
GFC_INTEGER_8 *restrict dest_y;
|
||||
GFC_INTEGER_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_INTEGER_8 *restrict abase_x;
|
||||
const GFC_INTEGER_8 *restrict bbase_y;
|
||||
GFC_INTEGER_8 *restrict dest_y;
|
||||
GFC_INTEGER_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_INTEGER_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_REAL_10)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_REAL_10 *restrict abase_x;
|
||||
const GFC_REAL_10 *restrict bbase_y;
|
||||
GFC_REAL_10 *restrict dest_y;
|
||||
GFC_REAL_10 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_10) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_REAL_10 *restrict abase_x;
|
||||
const GFC_REAL_10 *restrict bbase_y;
|
||||
GFC_REAL_10 *restrict dest_y;
|
||||
GFC_REAL_10 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_10) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_REAL_16)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_REAL_16 *restrict abase_x;
|
||||
const GFC_REAL_16 *restrict bbase_y;
|
||||
GFC_REAL_16 *restrict dest_y;
|
||||
GFC_REAL_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_REAL_16 *restrict abase_x;
|
||||
const GFC_REAL_16 *restrict bbase_y;
|
||||
GFC_REAL_16 *restrict dest_y;
|
||||
GFC_REAL_16 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_16) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_REAL_4)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_REAL_4 *restrict abase_x;
|
||||
const GFC_REAL_4 *restrict bbase_y;
|
||||
GFC_REAL_4 *restrict dest_y;
|
||||
GFC_REAL_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_REAL_4 *restrict abase_x;
|
||||
const GFC_REAL_4 *restrict bbase_y;
|
||||
GFC_REAL_4 *restrict dest_y;
|
||||
GFC_REAL_4 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_4) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||
|
||||
#if defined (HAVE_GFC_REAL_8)
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
@ -204,7 +217,28 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const GFC_REAL_8 *restrict abase_x;
|
||||
const GFC_REAL_8 *restrict bbase_y;
|
||||
GFC_REAL_8 *restrict dest_y;
|
||||
GFC_REAL_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -216,6 +250,27 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const GFC_REAL_8 *restrict abase_x;
|
||||
const GFC_REAL_8 *restrict bbase_y;
|
||||
GFC_REAL_8 *restrict dest_y;
|
||||
GFC_REAL_8 s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (GFC_REAL_8) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -37,16 +37,29 @@ include(iparm.m4)dnl
|
||||
|
||||
`#if defined (HAVE_'rtype_name`)'
|
||||
|
||||
/* This is a C version of the following fortran pseudo-code. The key
|
||||
point is the loop order -- we access all arrays column-first, which
|
||||
improves the performance enough to boost galgel spec score by 50%.
|
||||
/* The order of loops is different in the case of plain matrix
|
||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||
looking at their strides.
|
||||
|
||||
The equivalent Fortran pseudo-code is:
|
||||
|
||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||
C = 0
|
||||
DO J=1,N
|
||||
DO K=1,COUNT
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
ELSE
|
||||
DO J=1,N
|
||||
DO I=1,M
|
||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||
S = 0
|
||||
DO K=1,COUNT
|
||||
S = S+A(I,K)+B(K,J)
|
||||
C(I,J) = S
|
||||
ENDIF
|
||||
*/
|
||||
|
||||
extern void matmul_`'rtype_code (rtype * const restrict retarray,
|
||||
@ -206,7 +219,28 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||
{
|
||||
const rtype_name *restrict abase_x;
|
||||
const rtype_name *restrict bbase_y;
|
||||
rtype_name *restrict dest_y;
|
||||
rtype_name s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (rtype_name) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n] * bbase_y[n];
|
||||
dest_y[x] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (axstride < aystride)
|
||||
{
|
||||
for (y = 0; y < ycount; y++)
|
||||
for (x = 0; x < xcount; x++)
|
||||
@ -218,6 +252,27 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||
}
|
||||
else
|
||||
{
|
||||
const rtype_name *restrict abase_x;
|
||||
const rtype_name *restrict bbase_y;
|
||||
rtype_name *restrict dest_y;
|
||||
rtype_name s;
|
||||
|
||||
for (y = 0; y < ycount; y++)
|
||||
{
|
||||
bbase_y = &bbase[y*bystride];
|
||||
dest_y = &dest[y*rystride];
|
||||
for (x = 0; x < xcount; x++)
|
||||
{
|
||||
abase_x = &abase[x*axstride];
|
||||
s = (rtype_name) 0;
|
||||
for (n = 0; n < count; n++)
|
||||
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||
dest_y[x*rxstride] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user