path: root/libgfortran
author     Thomas Koenig <tkoenig@gcc.gnu.org>    2017-05-25 21:51:27 +0000
committer  Thomas Koenig <tkoenig@gcc.gnu.org>    2017-05-25 21:51:27 +0000
commit     1d5cf7fcf28c0b7d44fab10d26fc450e9d962f03 (patch)
tree       3cd241e469094809d6ceb3aa1a14f6841627e681 /libgfortran
parent     87e1e6036ef93b18b1450357488ee907db880f37 (diff)
re PR libfortran/78379 (Processor-specific versions for matmul)
2017-05-25  Thomas Koenig  <tkoenig@gcc.gnu.org>

        PR libfortran/78379
        * Makefile.am: Add generated/matmulavx128_*.c files.
        Handle them for compiling and setting the right flags.
        * acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
        * configure.ac: Call them.
        * Makefile.in: Regenerated.
        * config.h.in: Regenerated.
        * configure: Regenerated.
        * m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
        versions which use FMA3 or FMA4.
        * m4/matmulavx128.m4: New file.
        * generated/matmul_c10.c: Regenerated.
        * generated/matmul_c16.c: Regenerated.
        * generated/matmul_c4.c: Regenerated.
        * generated/matmul_c8.c: Regenerated.
        * generated/matmul_i1.c: Regenerated.
        * generated/matmul_i16.c: Regenerated.
        * generated/matmul_i2.c: Regenerated.
        * generated/matmul_i4.c: Regenerated.
        * generated/matmul_i8.c: Regenerated.
        * generated/matmul_r10.c: Regenerated.
        * generated/matmul_r16.c: Regenerated.
        * generated/matmul_r4.c: Regenerated.
        * generated/matmul_r8.c: Regenerated.
        * generated/matmulavx128_c10.c: New file.
        * generated/matmulavx128_c16.c: New file.
        * generated/matmulavx128_c4.c: New file.
        * generated/matmulavx128_c8.c: New file.
        * generated/matmulavx128_i1.c: New file.
        * generated/matmulavx128_i16.c: New file.
        * generated/matmulavx128_i2.c: New file.
        * generated/matmulavx128_i4.c: New file.
        * generated/matmulavx128_i8.c: New file.
        * generated/matmulavx128_r10.c: New file.
        * generated/matmulavx128_r16.c: New file.
        * generated/matmulavx128_r4.c: New file.
        * generated/matmulavx128_r8.c: New file.

From-SVN: r248472
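For context: the dispatch code added to the generated matmul_*.c files below picks a kernel once at runtime, caches it in a function pointer, and on AMD CPUs prefers the 128-bit AVX variants built with FMA3 (or, failing that, FMA4). The following is a minimal standalone C sketch of that pattern, not the library's actual code; it uses GCC's __builtin_cpu_* helpers instead of the __cpu_model fields the generated files read directly, and the kernel names are illustrative placeholders.

#include <stdio.h>

/* Illustrative kernels; in libgfortran these are generated from
   m4/matmulavx128.m4 and compiled with -mprefer-avx128.  */
static void matmul_vanilla (void) { puts ("generic kernel"); }

__attribute__ ((target ("avx,fma")))
static void matmul_avx128_fma3 (void) { puts ("AVX128/FMA3 kernel"); }

__attribute__ ((target ("avx,fma4")))
static void matmul_avx128_fma4 (void) { puts ("AVX128/FMA4 kernel"); }

/* Cached choice, mirroring the matmul_p pointer in the generated files.  */
static void (*matmul_p) (void);

static void
pick_kernel (void)
{
  void (*fn) (void) = matmul_vanilla;

  __builtin_cpu_init ();
  if (__builtin_cpu_is ("amd"))
    {
      /* FMA3 is tried first, then FMA4, matching the generated dispatch.  */
      if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
        fn = matmul_avx128_fma3;
      else if (__builtin_cpu_supports ("avx")
               && __builtin_cpu_supports ("fma4"))
        fn = matmul_avx128_fma4;
    }
  __atomic_store_n (&matmul_p, fn, __ATOMIC_RELAXED);
}

int
main (void)
{
  pick_kernel ();
  matmul_p ();
  return 0;
}

Selecting the kernel once and publishing it with a relaxed atomic store keeps later calls cheap while remaining harmless if several threads race on the first call, which is the same design the generated files use.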
Diffstat (limited to 'libgfortran')
-rw-r--r--  libgfortran/ChangeLog                      |   42
-rw-r--r--  libgfortran/Makefile.am                    |   27
-rw-r--r--  libgfortran/Makefile.in                    |  174
-rw-r--r--  libgfortran/acinclude.m4                   |   50
-rw-r--r--  libgfortran/config.h.in                    |    9
-rwxr-xr-x  libgfortran/configure                      |  103
-rw-r--r--  libgfortran/configure.ac                   |    9
-rw-r--r--  libgfortran/generated/matmul_c10.c         |   38
-rw-r--r--  libgfortran/generated/matmul_c16.c         |   38
-rw-r--r--  libgfortran/generated/matmul_c4.c          |   38
-rw-r--r--  libgfortran/generated/matmul_c8.c          |   38
-rw-r--r--  libgfortran/generated/matmul_i1.c          |   38
-rw-r--r--  libgfortran/generated/matmul_i16.c         |   38
-rw-r--r--  libgfortran/generated/matmul_i2.c          |   38
-rw-r--r--  libgfortran/generated/matmul_i4.c          |   38
-rw-r--r--  libgfortran/generated/matmul_i8.c          |   38
-rw-r--r--  libgfortran/generated/matmul_r10.c         |   38
-rw-r--r--  libgfortran/generated/matmul_r16.c         |   38
-rw-r--r--  libgfortran/generated/matmul_r4.c          |   38
-rw-r--r--  libgfortran/generated/matmul_r8.c          |   38
-rw-r--r--  libgfortran/generated/matmulavx128_c10.c   | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_c16.c   | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_c4.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_c8.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_i1.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_i16.c   | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_i2.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_i4.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_i8.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_r10.c   | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_r16.c   | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_r4.c    | 1152
-rw-r--r--  libgfortran/generated/matmulavx128_r8.c    | 1152
-rw-r--r--  libgfortran/m4/matmul.m4                   |   40
-rw-r--r--  libgfortran/m4/matmulavx128.m4             |   67
35 files changed, 15964 insertions, 27 deletions
diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog
index 4ada8b8074a..3e6c0ad2a61 100644
--- a/libgfortran/ChangeLog
+++ b/libgfortran/ChangeLog
@@ -1,3 +1,43 @@
+2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
+
+ PR libfortran/78379
+ * Makefile.am: Add generated/matmulavx128_*.c files.
+ Handle them for compiling and setting the right flags.
+ * acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
+ * configure.ac: Call them.
+ * Makefile.in: Regenerated.
+ * config.h.in: Regenerated.
+ * configure: Regenerated.
+ * m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
+ versions which use FMA3 or FMA4.
+ * m4/matmulavx128.m4: New file.
+ * generated/matmul_c10.c: Regenerated.
+ * generated/matmul_c16.c: Regenerated.
+ * generated/matmul_c4.c: Regenerated.
+ * generated/matmul_c8.c: Regenerated.
+ * generated/matmul_i1.c: Regenerated.
+ * generated/matmul_i16.c: Regenerated.
+ * generated/matmul_i2.c: Regenerated.
+ * generated/matmul_i4.c: Regenerated.
+ * generated/matmul_i8.c: Regenerated.
+ * generated/matmul_r10.c: Regenerated.
+ * generated/matmul_r16.c: Regenerated.
+ * generated/matmul_r4.c: Regenerated.
+ * generated/matmul_r8.c: Regenerated.
+ * generated/matmulavx128_c10.c: New file.
+ * generated/matmulavx128_c16.c: New file.
+ * generated/matmulavx128_c4.c: New file.
+ * generated/matmulavx128_c8.c: New file.
+ * generated/matmulavx128_i1.c: New file.
+ * generated/matmulavx128_i16.c: New file.
+ * generated/matmulavx128_i2.c: New file.
+ * generated/matmulavx128_i4.c: New file.
+ * generated/matmulavx128_i8.c: New file.
+ * generated/matmulavx128_r10.c: New file.
+ * generated/matmulavx128_r16.c: New file.
+ * generated/matmulavx128_r4.c: New file.
+ * generated/matmulavx128_r8.c: New file.
+
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
Jerry DeLisle <jvdelisle@gcc.gnu.org>
@@ -14,7 +54,7 @@
(st_endfile): Likewise.
(st_rewind): Likewise.
(st_flush): Likewise.
-
+
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727
diff --git a/libgfortran/Makefile.am b/libgfortran/Makefile.am
index 2857af5943f..90888129303 100644
--- a/libgfortran/Makefile.am
+++ b/libgfortran/Makefile.am
@@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
+i_matmulavx128_c= \
+$(srcdir)/generated/matmulavx128_i1.c \
+$(srcdir)/generated/matmulavx128_i2.c \
+$(srcdir)/generated/matmulavx128_i4.c \
+$(srcdir)/generated/matmulavx128_i8.c \
+$(srcdir)/generated/matmulavx128_i16.c \
+$(srcdir)/generated/matmulavx128_r4.c \
+$(srcdir)/generated/matmulavx128_r8.c \
+$(srcdir)/generated/matmulavx128_r10.c \
+$(srcdir)/generated/matmulavx128_r16.c \
+$(srcdir)/generated/matmulavx128_c4.c \
+$(srcdir)/generated/matmulavx128_c8.c \
+$(srcdir)/generated/matmulavx128_c10.c \
+$(srcdir)/generated/matmulavx128_c16.c
+
i_matmull_c= \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
- $(i_pow_c) $(i_pack_c) $(i_unpack_c) \
+ $(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
intrinsics/f2c_specifics.F90
# Turn on vectorization and loop unrolling for matmul.
-$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
+$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
+
+if HAVE_AVX128
+# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
+$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
+endif
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
+$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
+ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
+
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
diff --git a/libgfortran/Makefile.in b/libgfortran/Makefile.in
index 4914a6f323f..e47d6ebc593 100644
--- a/libgfortran/Makefile.in
+++ b/libgfortran/Makefile.in
@@ -289,15 +289,20 @@ am__objects_32 = unpack_i1.lo unpack_i2.lo unpack_i4.lo unpack_i8.lo \
unpack_i16.lo unpack_r4.lo unpack_r8.lo unpack_r10.lo \
unpack_r16.lo unpack_c4.lo unpack_c8.lo unpack_c10.lo \
unpack_c16.lo
-am__objects_33 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
+am__objects_33 = matmulavx128_i1.lo matmulavx128_i2.lo \
+ matmulavx128_i4.lo matmulavx128_i8.lo matmulavx128_i16.lo \
+ matmulavx128_r4.lo matmulavx128_r8.lo matmulavx128_r10.lo \
+ matmulavx128_r16.lo matmulavx128_c4.lo matmulavx128_c8.lo \
+ matmulavx128_c10.lo matmulavx128_c16.lo
+am__objects_34 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
spread_i16.lo spread_r4.lo spread_r8.lo spread_r10.lo \
spread_r16.lo spread_c4.lo spread_c8.lo spread_c10.lo \
spread_c16.lo
-am__objects_34 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
+am__objects_35 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
cshift0_i8.lo cshift0_i16.lo cshift0_r4.lo cshift0_r8.lo \
cshift0_r10.lo cshift0_r16.lo cshift0_c4.lo cshift0_c8.lo \
cshift0_c10.lo cshift0_c16.lo
-am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
+am__objects_36 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_7) $(am__objects_8) $(am__objects_9) \
$(am__objects_10) $(am__objects_11) $(am__objects_12) \
$(am__objects_13) $(am__objects_14) $(am__objects_15) \
@@ -307,14 +312,14 @@ am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_25) $(am__objects_26) $(am__objects_27) \
$(am__objects_28) $(am__objects_29) $(am__objects_30) \
$(am__objects_31) $(am__objects_32) $(am__objects_33) \
- $(am__objects_34)
-@LIBGFOR_MINIMAL_FALSE@am__objects_36 = close.lo file_pos.lo format.lo \
+ $(am__objects_34) $(am__objects_35)
+@LIBGFOR_MINIMAL_FALSE@am__objects_37 = close.lo file_pos.lo format.lo \
@LIBGFOR_MINIMAL_FALSE@ inquire.lo intrinsics.lo list_read.lo \
@LIBGFOR_MINIMAL_FALSE@ lock.lo open.lo read.lo transfer.lo \
@LIBGFOR_MINIMAL_FALSE@ transfer128.lo unit.lo unix.lo write.lo \
@LIBGFOR_MINIMAL_FALSE@ fbuf.lo
-am__objects_37 = size_from_kind.lo $(am__objects_36)
-@LIBGFOR_MINIMAL_FALSE@am__objects_38 = access.lo c99_functions.lo \
+am__objects_38 = size_from_kind.lo $(am__objects_37)
+@LIBGFOR_MINIMAL_FALSE@am__objects_39 = access.lo c99_functions.lo \
@LIBGFOR_MINIMAL_FALSE@ chdir.lo chmod.lo clock.lo cpu_time.lo \
@LIBGFOR_MINIMAL_FALSE@ ctime.lo date_and_time.lo dtime.lo \
@LIBGFOR_MINIMAL_FALSE@ env.lo etime.lo execute_command_line.lo \
@@ -324,19 +329,19 @@ am__objects_37 = size_from_kind.lo $(am__objects_36)
@LIBGFOR_MINIMAL_FALSE@ rename.lo stat.lo symlnk.lo \
@LIBGFOR_MINIMAL_FALSE@ system_clock.lo time.lo umask.lo \
@LIBGFOR_MINIMAL_FALSE@ unlink.lo
-@IEEE_SUPPORT_TRUE@am__objects_39 = ieee_helper.lo
-am__objects_40 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
+@IEEE_SUPPORT_TRUE@am__objects_40 = ieee_helper.lo
+am__objects_41 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
eoshift2.lo erfc_scaled.lo extends_type_of.lo fnum.lo \
ierrno.lo ishftc.lo mvbits.lo move_alloc.lo pack_generic.lo \
selected_char_kind.lo size.lo spread_generic.lo \
string_intrinsics.lo rand.lo random.lo reshape_generic.lo \
reshape_packed.lo selected_int_kind.lo selected_real_kind.lo \
unpack_generic.lo in_pack_generic.lo in_unpack_generic.lo \
- $(am__objects_38) $(am__objects_39)
-@IEEE_SUPPORT_TRUE@am__objects_41 = ieee_arithmetic.lo \
+ $(am__objects_39) $(am__objects_40)
+@IEEE_SUPPORT_TRUE@am__objects_42 = ieee_arithmetic.lo \
@IEEE_SUPPORT_TRUE@ ieee_exceptions.lo ieee_features.lo
-am__objects_42 =
-am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
+am__objects_43 =
+am__objects_44 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_abs_i4.lo _abs_i8.lo _abs_i16.lo _abs_r4.lo _abs_r8.lo \
_abs_r10.lo _abs_r16.lo _aimag_c4.lo _aimag_c8.lo \
_aimag_c10.lo _aimag_c16.lo _exp_r4.lo _exp_r8.lo _exp_r10.lo \
@@ -360,19 +365,19 @@ am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_conjg_c4.lo _conjg_c8.lo _conjg_c10.lo _conjg_c16.lo \
_aint_r4.lo _aint_r8.lo _aint_r10.lo _aint_r16.lo _anint_r4.lo \
_anint_r8.lo _anint_r10.lo _anint_r16.lo
-am__objects_44 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
+am__objects_45 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
_sign_r8.lo _sign_r10.lo _sign_r16.lo _dim_i4.lo _dim_i8.lo \
_dim_i16.lo _dim_r4.lo _dim_r8.lo _dim_r10.lo _dim_r16.lo \
_atan2_r4.lo _atan2_r8.lo _atan2_r10.lo _atan2_r16.lo \
_mod_i4.lo _mod_i8.lo _mod_i16.lo _mod_r4.lo _mod_r8.lo \
_mod_r10.lo _mod_r16.lo
-am__objects_45 = misc_specifics.lo
-am__objects_46 = $(am__objects_43) $(am__objects_44) $(am__objects_45) \
+am__objects_46 = misc_specifics.lo
+am__objects_47 = $(am__objects_44) $(am__objects_45) $(am__objects_46) \
dprod_r8.lo f2c_specifics.lo
-am__objects_47 = $(am__objects_3) $(am__objects_35) $(am__objects_37) \
- $(am__objects_40) $(am__objects_41) $(am__objects_42) \
- $(am__objects_46)
-@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_47)
+am__objects_48 = $(am__objects_3) $(am__objects_36) $(am__objects_38) \
+ $(am__objects_41) $(am__objects_42) $(am__objects_43) \
+ $(am__objects_47)
+@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_48)
@onestep_TRUE@am_libgfortran_la_OBJECTS = libgfortran_c.lo
libgfortran_la_OBJECTS = $(am_libgfortran_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
@@ -879,6 +884,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
+i_matmulavx128_c = \
+$(srcdir)/generated/matmulavx128_i1.c \
+$(srcdir)/generated/matmulavx128_i2.c \
+$(srcdir)/generated/matmulavx128_i4.c \
+$(srcdir)/generated/matmulavx128_i8.c \
+$(srcdir)/generated/matmulavx128_i16.c \
+$(srcdir)/generated/matmulavx128_r4.c \
+$(srcdir)/generated/matmulavx128_r8.c \
+$(srcdir)/generated/matmulavx128_r10.c \
+$(srcdir)/generated/matmulavx128_r16.c \
+$(srcdir)/generated/matmulavx128_c4.c \
+$(srcdir)/generated/matmulavx128_c8.c \
+$(srcdir)/generated/matmulavx128_c10.c \
+$(srcdir)/generated/matmulavx128_c16.c
+
i_matmull_c = \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@@ -1059,7 +1079,7 @@ gfor_built_src = $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
- $(i_pow_c) $(i_pack_c) $(i_unpack_c) \
+ $(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@@ -1518,6 +1538,19 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i8.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r10.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r4.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i1.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i2.Plo@am__quote@
@@ -4584,6 +4617,97 @@ unpack_c16.lo: $(srcdir)/generated/unpack_c16.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unpack_c16.lo `test -f '$(srcdir)/generated/unpack_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/unpack_c16.c
+matmulavx128_i1.lo: $(srcdir)/generated/matmulavx128_i1.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i1.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i1.Tpo -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i1.Tpo $(DEPDIR)/matmulavx128_i1.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i1.c' object='matmulavx128_i1.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
+
+matmulavx128_i2.lo: $(srcdir)/generated/matmulavx128_i2.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i2.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i2.Tpo -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i2.Tpo $(DEPDIR)/matmulavx128_i2.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i2.c' object='matmulavx128_i2.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
+
+matmulavx128_i4.lo: $(srcdir)/generated/matmulavx128_i4.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i4.Tpo -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i4.Tpo $(DEPDIR)/matmulavx128_i4.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i4.c' object='matmulavx128_i4.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
+
+matmulavx128_i8.lo: $(srcdir)/generated/matmulavx128_i8.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i8.Tpo -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i8.Tpo $(DEPDIR)/matmulavx128_i8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i8.c' object='matmulavx128_i8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
+
+matmulavx128_i16.lo: $(srcdir)/generated/matmulavx128_i16.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i16.Tpo -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i16.Tpo $(DEPDIR)/matmulavx128_i16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i16.c' object='matmulavx128_i16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
+
+matmulavx128_r4.lo: $(srcdir)/generated/matmulavx128_r4.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r4.Tpo -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r4.Tpo $(DEPDIR)/matmulavx128_r4.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r4.c' object='matmulavx128_r4.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
+
+matmulavx128_r8.lo: $(srcdir)/generated/matmulavx128_r8.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r8.Tpo -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r8.Tpo $(DEPDIR)/matmulavx128_r8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r8.c' object='matmulavx128_r8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
+
+matmulavx128_r10.lo: $(srcdir)/generated/matmulavx128_r10.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r10.Tpo -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r10.Tpo $(DEPDIR)/matmulavx128_r10.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r10.c' object='matmulavx128_r10.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
+
+matmulavx128_r16.lo: $(srcdir)/generated/matmulavx128_r16.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r16.Tpo -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r16.Tpo $(DEPDIR)/matmulavx128_r16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r16.c' object='matmulavx128_r16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
+
+matmulavx128_c4.lo: $(srcdir)/generated/matmulavx128_c4.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c4.Tpo -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c4.Tpo $(DEPDIR)/matmulavx128_c4.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c4.c' object='matmulavx128_c4.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
+
+matmulavx128_c8.lo: $(srcdir)/generated/matmulavx128_c8.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c8.Tpo -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c8.Tpo $(DEPDIR)/matmulavx128_c8.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c8.c' object='matmulavx128_c8.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
+
+matmulavx128_c10.lo: $(srcdir)/generated/matmulavx128_c10.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c10.Tpo -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c10.Tpo $(DEPDIR)/matmulavx128_c10.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c10.c' object='matmulavx128_c10.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
+
+matmulavx128_c16.lo: $(srcdir)/generated/matmulavx128_c16.c
+@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c16.Tpo -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c16.Tpo $(DEPDIR)/matmulavx128_c16.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c16.c' object='matmulavx128_c16.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
+
spread_i1.lo: $(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT spread_i1.lo -MD -MP -MF $(DEPDIR)/spread_i1.Tpo -c -o spread_i1.lo `test -f '$(srcdir)/generated/spread_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/spread_i1.Tpo $(DEPDIR)/spread_i1.Plo
@@ -5567,7 +5691,10 @@ uninstall-am: uninstall-cafexeclibLTLIBRARIES \
@LIBGFOR_USE_SYMVER_SUN_TRUE@@LIBGFOR_USE_SYMVER_TRUE@ > $@ || (rm -f $@ ; exit 1)
# Turn on vectorization and loop unrolling for matmul.
-$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
+$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
+
+# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
+@HAVE_AVX128_TRUE@$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@@ -5667,6 +5794,9 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
+@MAINTAINER_MODE_TRUE@$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
+@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
+
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
diff --git a/libgfortran/acinclude.m4 b/libgfortran/acinclude.m4
index ceb80a12a18..cd8dfabffe9 100644
--- a/libgfortran/acinclude.m4
+++ b/libgfortran/acinclude.m4
@@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
[])
CFLAGS="$ac_save_CFLAGS"
])
+
+dnl Check for FMA3
+dnl
+AC_DEFUN([LIBGFOR_CHECK_FMA3], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mfma -mno-fma4"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ float
+ flt_mul_add (float a, float b, float c)
+ {
+ return __builtin_fmaf (a, b, c);
+ }]], [[]])],
+ AC_DEFINE(HAVE_FMA3, 1,
+ [Define if FMA3 instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for FMA4
+dnl
+AC_DEFUN([LIBGFOR_CHECK_FMA4], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mfma4 -mno-fma"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ float
+ flt_mul_add (float a, float b, float c)
+ {
+ return __builtin_fmaf (a, b, c);
+ }]], [[]])],
+ AC_DEFINE(HAVE_FMA4, 1,
+ [Define if FMA4 instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for -mprefer-avx128
+dnl This also defines an automake conditional.
+AC_DEFUN([LIBGFOR_CHECK_AVX128], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx -mprefer-avx128"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ void foo()
+ {
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX128, 1,
+ [Define if -mprefer-avx128 is supported.])
+ AM_CONDITIONAL([HAVE_AVX128],true),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
diff --git a/libgfortran/config.h.in b/libgfortran/config.h.in
index b762d0990b6..0274e5d9c4c 100644
--- a/libgfortran/config.h.in
+++ b/libgfortran/config.h.in
@@ -81,6 +81,9 @@
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
+/* Define if -mprefer-avx128 is supported. */
+#undef HAVE_AVX128
+
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
@@ -375,6 +378,12 @@
/* Define to 1 if you have the `floorl' function. */
#undef HAVE_FLOORL
+/* Define if FMA3 instructions can be compiled. */
+#undef HAVE_FMA3
+
+/* Define if FMA4 instructions can be compiled. */
+#undef HAVE_FMA4
+
/* Define to 1 if you have the `fmod' function. */
#undef HAVE_FMOD
diff --git a/libgfortran/configure b/libgfortran/configure
index 81238fcb79c..36e015594ba 100755
--- a/libgfortran/configure
+++ b/libgfortran/configure
@@ -606,6 +606,8 @@ am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
get_gcc_base_ver
+HAVE_AVX128_FALSE
+HAVE_AVX128_TRUE
IEEE_FLAGS
IEEE_SUPPORT
IEEE_SUPPORT_FALSE
@@ -12421,7 +12423,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 12424 "configure"
+#line 12426 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -12527,7 +12529,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 12530 "configure"
+#line 12532 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
+# Check for FMA3 extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mfma -mno-fma4"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ float
+ flt_mul_add (float a, float b, float c)
+ {
+ return __builtin_fmaf (a, b, c);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_FMA3 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check for FMA4 extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mfma4 -mno-fma"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ float
+ flt_mul_add (float a, float b, float c)
+ {
+ return __builtin_fmaf (a, b, c);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_FMA4 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check if AVX128 works
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx -mprefer-avx128"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ void foo()
+ {
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX128 1" >>confdefs.h
+
+ if true; then
+ HAVE_AVX128_TRUE=
+ HAVE_AVX128_FALSE='#'
+else
+ HAVE_AVX128_TRUE='#'
+ HAVE_AVX128_FALSE=
+fi
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
# Determine what GCC version number to use in filesystem paths.
get_gcc_base_ver="cat"
@@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
+ as_fn_error "conditional \"HAVE_AVX128\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0
diff --git a/libgfortran/configure.ac b/libgfortran/configure.ac
index 37b12d2998f..78214ac13c8 100644
--- a/libgfortran/configure.ac
+++ b/libgfortran/configure.ac
@@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
# Check whether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
+# Check for FMA3 extensions
+LIBGFOR_CHECK_FMA3
+
+# Check for FMA4 extensions
+LIBGFOR_CHECK_FMA4
+
+# Check if AVX128 works
+LIBGFOR_CHECK_AVX128
+
# Determine what GCC version number to use in filesystem paths.
GCC_BASE_VER
diff --git a/libgfortran/generated/matmul_c10.c b/libgfortran/generated/matmul_c10.c
index dbe3d3a82f1..54e2714668d 100644
--- a/libgfortran/generated/matmul_c10.c
+++ b/libgfortran/generated/matmul_c10.c
@@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c10_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c10_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_c10_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_c10_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_c16.c b/libgfortran/generated/matmul_c16.c
index 36c8747f895..cd8aacd9c68 100644
--- a/libgfortran/generated/matmul_c16.c
+++ b/libgfortran/generated/matmul_c16.c
@@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c16_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c16_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_c16_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_c16_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c
index 79343dde750..ead22efabb2 100644
--- a/libgfortran/generated/matmul_c4.c
+++ b/libgfortran/generated/matmul_c4.c
@@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c4_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c4_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_c4_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_c4_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c
index 0307d3ff559..a52e4bd7f0a 100644
--- a/libgfortran/generated/matmul_c8.c
+++ b/libgfortran/generated/matmul_c8.c
@@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c8_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c8_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_c8_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_c8_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_i1.c b/libgfortran/generated/matmul_i1.c
index 8cda1c322ae..dfd47e176e4 100644
--- a/libgfortran/generated/matmul_i1.c
+++ b/libgfortran/generated/matmul_i1.c
@@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i1_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i1_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_i1_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_i1_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_i16.c b/libgfortran/generated/matmul_i16.c
index 06035225d99..a7bdcb5c67d 100644
--- a/libgfortran/generated/matmul_i16.c
+++ b/libgfortran/generated/matmul_i16.c
@@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i16_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i16_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_i16_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_i16_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_i2.c b/libgfortran/generated/matmul_i2.c
index 93316cb34ad..d541fa3fe73 100644
--- a/libgfortran/generated/matmul_i2.c
+++ b/libgfortran/generated/matmul_i2.c
@@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i2_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i2_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_i2_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_i2_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c
index 475b214ae79..f8f0cdb96cc 100644
--- a/libgfortran/generated/matmul_i4.c
+++ b/libgfortran/generated/matmul_i4.c
@@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i4_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i4_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_i4_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_i4_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c
index d29a4b48c06..2aac1d76822 100644
--- a/libgfortran/generated/matmul_i8.c
+++ b/libgfortran/generated/matmul_i8.c
@@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i8_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i8_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_i8_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_i8_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_r10.c b/libgfortran/generated/matmul_r10.c
index f1885a434c2..448c96bb99f 100644
--- a/libgfortran/generated/matmul_r10.c
+++ b/libgfortran/generated/matmul_r10.c
@@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r10_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r10_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_r10_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_r10_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_r16.c b/libgfortran/generated/matmul_r16.c
index a7aa44ff1b2..57a47dcd846 100644
--- a/libgfortran/generated/matmul_r16.c
+++ b/libgfortran/generated/matmul_r16.c
@@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r16_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r16_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_r16_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_r16_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c
index 13530252032..52eea532a9b 100644
--- a/libgfortran/generated/matmul_r4.c
+++ b/libgfortran/generated/matmul_r4.c
@@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r4_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r4_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_r4_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_r4_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c
index a4af0008518..074697dbf8b 100644
--- a/libgfortran/generated/matmul_r8.c
+++ b/libgfortran/generated/matmul_r8.c
@@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r8_avx128_fma3);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r8_avx128_fma4);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
@@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_r8_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_r8_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/generated/matmulavx128_c10.c b/libgfortran/generated/matmulavx128_c10.c
new file mode 100644
index 00000000000..53cdecbd4e8
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_c10.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_COMPLEX_10)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
+ const int *, const GFC_COMPLEX_10 *, const int *,
+ const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c10_avx128_fma3);
+void
+matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_10 * restrict abase;
+ const GFC_COMPLEX_10 * restrict bbase;
+ GFC_COMPLEX_10 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_10 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_10 *a, *b;
+ GFC_COMPLEX_10 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_10 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c10_avx128_fma4);
+void
+matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_10 * restrict abase;
+ const GFC_COMPLEX_10 * restrict bbase;
+ GFC_COMPLEX_10 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_10 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_10 *a, *b;
+ GFC_COMPLEX_10 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_10 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
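Each of these AVX128 kernels also carries the same hand-off test to an external ?gemm seen in the file above: when try_blas is set, the strides are unit, and xcount*ycount*count exceeds blas_limit cubed (the POW3 macro), the multiplication is delegated to the BLAS routine passed in by the front end. A rough, standalone sketch of just that threshold test follows; use_blas_p is a made-up helper, the stride and rank checks of the real code are omitted, and the limit of 30 is only an illustrative value.

#include <stdio.h>

/* Returns nonzero when the problem is large enough to hand to ?gemm.  */
static int
use_blas_p (int try_blas, long m, long n, long k, int blas_limit)
{
  double work = (double) m * (double) n * (double) k;
  double cutoff = (double) blas_limit * blas_limit * blas_limit;
  return try_blas && work > cutoff;
}

int
main (void)
{
  /* With a limit of 30, a 100x100 * 100x100 product (1e6 terms) is
     handed off, while a 20x20 * 20x20 product (8e3 terms) is not.  */
  printf ("%d %d\n",
          use_blas_p (1, 100, 100, 100, 30),
          use_blas_p (1, 20, 20, 20, 30));
  return 0;
}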
diff --git a/libgfortran/generated/matmulavx128_c16.c b/libgfortran/generated/matmulavx128_c16.c
new file mode 100644
index 00000000000..e7657a098a5
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_c16.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_COMPLEX_16)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
+ const int *, const GFC_COMPLEX_16 *, const int *,
+ const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c16_avx128_fma3);
+void
+matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_16 * restrict abase;
+ const GFC_COMPLEX_16 * restrict bbase;
+ GFC_COMPLEX_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_16 *a, *b;
+ GFC_COMPLEX_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c16_avx128_fma4);
+void
+matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_16 * restrict abase;
+ const GFC_COMPLEX_16 * restrict bbase;
+ GFC_COMPLEX_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
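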
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the amount of
+	 memory needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
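+  /* As a rough illustration of the heuristic below: with a blas_limit of,
+     say, 30, the external gemm is only considered once
+     xcount * ycount * count exceeds 30*30*30 = 27000, i.e. for products
+     roughly the size of 30x30 times 30x30 and larger, and only when the
+     strides allow it and m, n and k are all greater than 1.  */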
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_16 *a, *b;
+ GFC_COMPLEX_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
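+
+      /* A note on the indexing used below: the expression
+	 l - ll + 1 + ((i - ii + 1) << 8) - 257 simplifies to
+	 (l - ll) + (i - ii) * 256, so t1 holds a packed 256 x 256 panel
+	 of A with the inner (k) index fastest.  As lsec and isec are
+	 both capped at 256, the largest index is 255 + 255 * 256 = 65535,
+	 which is why t1_dim is limited to 65536 elements above.  */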
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
diff --git a/libgfortran/generated/matmulavx128_c4.c b/libgfortran/generated/matmulavx128_c4.c
new file mode 100644
index 00000000000..950f1eb49de
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_c4.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_COMPLEX_4)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
+ const int *, const GFC_COMPLEX_4 *, const int *,
+ const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
+ int, int);
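+
+/* For reference, an external BLAS cgemm with this calling convention would
+   be declared roughly as follows (a hypothetical sketch, not part of this
+   file; the exact symbol name depends on the BLAS binding in use):
+
+     extern void cgemm_ (const char *transa, const char *transb,
+			 const int *m, const int *n, const int *k,
+			 const GFC_COMPLEX_4 *alpha, const GFC_COMPLEX_4 *a,
+			 const int *lda, const GFC_COMPLEX_4 *b,
+			 const int *ldb, const GFC_COMPLEX_4 *beta,
+			 GFC_COMPLEX_4 *c, const int *ldc,
+			 int transa_len, int transb_len);
+
+   The two trailing ints presumably carry the hidden lengths of the
+   character arguments; the call sites below pass 1 for both.  */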
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c4_avx128_fma3);
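+
+/* The target attribute above lets this function be compiled with AVX and
+   FMA3 enabled even though the rest of the library is built for a generic
+   x86 target; the generic matmul_c4 entry point is expected to select one
+   of these specific versions at run time based on the capabilities of the
+   CPU it runs on.  */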
+void
+matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_4 * restrict abase;
+ const GFC_COMPLEX_4 * restrict bbase;
+ GFC_COMPLEX_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the amount of
+	 memory needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
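+
+      /* Roughly: result columns are processed in blocks of 512 (jj),
+	 the inner dimension in blocks of 256 (ll) and the rows in
+	 blocks of 256 (ii); within each block a panel of A is copied
+	 into the t1 scratch buffer and C is updated in 4 x 4 register
+	 tiles (the f11 ... f44 accumulators).  */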
+
+ const GFC_COMPLEX_4 *a, *b;
+ GFC_COMPLEX_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c4_avx128_fma4);
+void
+matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_4 * restrict abase;
+ const GFC_COMPLEX_4 * restrict bbase;
+ GFC_COMPLEX_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the amount of
+	 memory needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_4 *a, *b;
+ GFC_COMPLEX_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
diff --git a/libgfortran/generated/matmulavx128_c8.c b/libgfortran/generated/matmulavx128_c8.c
new file mode 100644
index 00000000000..a41c160c993
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_c8.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_COMPLEX_8)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
+ const int *, const GFC_COMPLEX_8 *, const int *,
+ const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_c8_avx128_fma3);
+void
+matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_8 * restrict abase;
+ const GFC_COMPLEX_8 * restrict bbase;
+ GFC_COMPLEX_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the amount of
+	 memory needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_8 *a, *b;
+ GFC_COMPLEX_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_c8_avx128_fma4);
+void
+matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_COMPLEX_8 * restrict abase;
+ const GFC_COMPLEX_8 * restrict bbase;
+ GFC_COMPLEX_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
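+  /* Defer to the gemm routine supplied by the front end only when try_blas
+     is set, the result has unit row stride, a and b are each contiguous
+     along one of their dimensions, and the work xcount*ycount*count
+     exceeds blas_limit cubed (subject to the extra checks inside);
+     otherwise fall through to the built-in kernels below.  */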
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_COMPLEX_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_COMPLEX_8 *a, *b;
+ GFC_COMPLEX_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_COMPLEX_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
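+      /* t1 serves as a 256 x 256 column-major scratch panel holding a block
+	 of a: rows are indexed by l - ll + 1, columns by i - ii + 1, and a
+	 1-based element (row, col) is addressed as t1[row + (col << 8) - 257],
+	 i.e. (row - 1) + (col - 1) * 256.  */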
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
+ /* Start turning the crank. */
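+      /* The loops below block the computation: jj walks the columns of b
+	 and c in chunks of 512, ll walks the inner dimension in chunks of
+	 256, and ii walks the rows of a and c in chunks of 256; each
+	 256 x 256 panel of a is copied into t1 and reused for every column
+	 of the current jj chunk.  */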
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+		  for (i = ii; i <= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
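+      /* Here a is stored with x as the faster-varying index, so x is kept
+	 innermost and the accesses to a walk memory with the smaller
+	 stride.  */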
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
diff --git a/libgfortran/generated/matmulavx128_i1.c b/libgfortran/generated/matmulavx128_i1.c
new file mode 100644
index 00000000000..e1871578ea2
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_i1.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_INTEGER_1)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
+ const int *, const GFC_INTEGER_1 *, const int *,
+ const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i1_avx128_fma3);
+void
+matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_1 * restrict abase;
+ const GFC_INTEGER_1 * restrict bbase;
+ GFC_INTEGER_1 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_1 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_1 *a, *b;
+ GFC_INTEGER_1 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_1 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+		  for (i = ii; i <= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
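+		  /* Multiply in 4 x 4 register tiles: f11 .. f44 accumulate
+		     a 4-row by 4-column block of c over the whole l loop
+		     before being written back.  */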
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
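+		      /* Leftover rows when isec is not a multiple of 4.  */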
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
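+		  /* Leftover columns when jsec is not a multiple of 4.  */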
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_1 *restrict abase_x;
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 *restrict dest_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_1 *restrict abase_x;
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 *restrict dest_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i1_avx128_fma4);
+void
+matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_1 * restrict abase;
+ const GFC_INTEGER_1 * restrict bbase;
+ GFC_INTEGER_1 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_1 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_1 *a, *b;
+ GFC_INTEGER_1 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_1 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
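+      /* The base pointers are shifted so that the translated Fortran code
+	 below can use 1-based indexing such as a[i + l * a_dim1].  */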
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+		  for (i = ii; i <= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_1 *restrict abase_x;
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 *restrict dest_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_1 *restrict abase_x;
+ const GFC_INTEGER_1 *restrict bbase_y;
+ GFC_INTEGER_1 *restrict dest_y;
+ GFC_INTEGER_1 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_1) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
diff --git a/libgfortran/generated/matmulavx128_i16.c b/libgfortran/generated/matmulavx128_i16.c
new file mode 100644
index 00000000000..1a7b733181a
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_i16.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_INTEGER_16)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
+ const int *, const GFC_INTEGER_16 *, const int *,
+ const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i16_avx128_fma3);
+void
+matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_16 * restrict abase;
+ const GFC_INTEGER_16 * restrict bbase;
+ GFC_INTEGER_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used for calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_16 *a, *b;
+ GFC_INTEGER_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i16_avx128_fma4);
+void
+matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_16 * restrict abase;
+ const GFC_INTEGER_16 * restrict bbase;
+ GFC_INTEGER_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used when calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_16 *a, *b;
+ GFC_INTEGER_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
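Each generated file in this patch defines two AVX128 entry points per numeric
kind, one restricted with __attribute__((__target__("avx,fma"))) and one with
__attribute__((__target__("avx,fma4"))), so a single libgfortran build can carry
both FMA3 and FMA4 kernels and defer the choice to run time.  The sketch below
shows one way such a choice could be made with GCC's CPU-feature builtins; it is
illustrative only, the typedef and function names are placeholders, and it is not
the selection code the library itself uses.

/* Hypothetical dispatch sketch: pick an AVX128 kernel variant from run-time
   CPU features.  Names and signatures are placeholders.  */
typedef void (*matmul_kernel) (void *retarray, void *a, void *b,
                               int try_blas, int blas_limit, void *gemm);

static matmul_kernel
pick_kernel (matmul_kernel generic, matmul_kernel avx128_fma3,
             matmul_kernel avx128_fma4)
{
  __builtin_cpu_init ();                    /* populate the CPU feature data */
  if (__builtin_cpu_supports ("avx"))
    {
      if (__builtin_cpu_supports ("fma"))   /* FMA3-capable CPU */
        return avx128_fma3;
      if (__builtin_cpu_supports ("fma4"))  /* FMA4-capable (older AMD) CPU */
        return avx128_fma4;
    }
  return generic;                           /* fall back to the plain kernel */
}

Keeping the AVX128 kernels behind target attributes lets the rest of the file be
compiled for the generic ISA baseline, while the annotated functions may use AVX
and FMA instructions.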
diff --git a/libgfortran/generated/matmulavx128_i2.c b/libgfortran/generated/matmulavx128_i2.c
new file mode 100644
index 00000000000..a095c5872eb
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_i2.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul compiled with -mprefer-avx128.  */
+
+#if defined (HAVE_GFC_INTEGER_2)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
+ const int *, const GFC_INTEGER_2 *, const int *,
+ const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i2_avx128_fma3);
+void
+matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_2 * restrict abase;
+ const GFC_INTEGER_2 * restrict bbase;
+ GFC_INTEGER_2 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used when calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_2 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_2 *a, *b;
+ GFC_INTEGER_2 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_2 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_2 *restrict abase_x;
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 *restrict dest_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_2 *restrict abase_x;
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 *restrict dest_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i2_avx128_fma4);
+void
+matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_2 * restrict abase;
+ const GFC_INTEGER_2 * restrict bbase;
+ GFC_INTEGER_2 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+	 The value is only used when calculating the memory
+	 needed for the buffer.  */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_2 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_2 *a, *b;
+ GFC_INTEGER_2 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_2 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_2 *restrict abase_x;
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 *restrict dest_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_2 *restrict abase_x;
+ const GFC_INTEGER_2 *restrict bbase_y;
+ GFC_INTEGER_2 *restrict dest_y;
+ GFC_INTEGER_2 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_2) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
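As with the other kinds, the kernels above hand the work to an external gemm only
when try_blas is set, the relevant strides are 1, and the product
xcount*ycount*count exceeds blas_limit cubed (the POW3 macro).  The standalone
snippet below illustrates that threshold arithmetic with made-up numbers; it is
not part of the library.

#include <stdio.h>

/* Same cube macro as in the generated kernels.  */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))

int main (void)
{
  /* Example values only: a 1000x1000 result with an inner dimension of 1000.  */
  float xcount = 1000.0f, ycount = 1000.0f, count = 1000.0f;
  int blas_limit = 30;                       /* hypothetical cutoff */

  float work = xcount * ycount * count;      /* 1.0e9 multiply-add operations */

  if (work > POW3 (blas_limit))              /* 1.0e9 > 27000, so BLAS is used */
    printf ("call gemm\n");
  else
    printf ("use the built-in blocked loop\n");
  return 0;
}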
diff --git a/libgfortran/generated/matmulavx128_i4.c b/libgfortran/generated/matmulavx128_i4.c
new file mode 100644
index 00000000000..a01c56f7138
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_i4.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul compiled with -mprefer-avx128.  */
+
+#if defined (HAVE_GFC_INTEGER_4)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
+ const int *, const GFC_INTEGER_4 *, const int *,
+ const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i4_avx128_fma3);
+void
+matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_4 * restrict abase;
+ const GFC_INTEGER_4 * restrict bbase;
+ GFC_INTEGER_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
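+
+/* Editorial illustration, not part of the original patch: for example,
+   MATMUL of A(3) with B(3,2) takes the rank-1 branch for A, so xcount = 1,
+   count = 3, ycount = 2 and the result has extent 2, while MATMUL of
+   A(2,3) with B(3) gives xcount = 2, count = 3, ycount = 1.  */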
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+ /* bystride should never be used for 1-dimensional b.
+ The value is only used for calculation of the
+         memory needed by the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
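+
+  /* Editorial worked example, not part of the original patch: with a
+     blas_limit of 30, the test above requires xcount*ycount*count > 27000,
+     so multiplying two 31x31 matrices (29791) is handed to the external
+     gemm, while 30x30 (exactly 27000) is not, since the comparison is
+     strict.  */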
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
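+
+    /* Editorial summary, not part of the original patch: the loop nest
+       below tiles the result into blocks of up to 512 columns, 256 rows
+       and 256 accumulation steps, copies the matching panel of A into
+       the contiguous buffer t1, and then updates C in 4x4 register
+       blocks held in f11..f44, as in the Kagstrom/Ling scheme.  */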
+
+ const GFC_INTEGER_4 *a, *b;
+ GFC_INTEGER_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
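+
+    /* Editorial worked example, not part of the original patch: with unit
+       strides and a 1000x1000 A, a_dim1 is 1000, so (1000-1)*256 + b_dim1
+       exceeds the cap and 65536 elements are allocated, while for
+       a_dim1 = b_dim1 = 100 the buffer needs only 99*256 + 100 = 25444
+       elements.  */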
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
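+/* Editorial note, not part of the original patch: the variant above is
+   built for the three-operand FMA3 form (target "avx,fma"), while the
+   variant below is built for the four-operand FMA4 form (target
+   "avx,fma4") found on AMD Bulldozer-family CPUs; both are 128-bit AVX
+   builds of the same code.  */
+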
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i4_avx128_fma4);
+void
+matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_4 * restrict abase;
+ const GFC_INTEGER_4 * restrict bbase;
+ GFC_INTEGER_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+ /* bystride should never be used for 1-dimensional b.
+ The value is only used for calculation of the
+         memory needed by the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_4 *a, *b;
+ GFC_INTEGER_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
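Editorial aside, not part of the commit: each generated file only provides kernels guarded by target attributes; some caller must still pick one at run time. A minimal sketch of that mechanism, using GCC's __builtin_cpu_supports with placeholder kernels and a purely illustrative selection policy (the library's real dispatcher is more involved), could look like this:

#include <stdio.h>

typedef void (*kernel_fn) (void);   /* placeholder signature */

static void generic_kernel (void)      { puts ("generic"); }
static void avx128_fma3_kernel (void)  { puts ("avx128 + fma3"); }
static void avx128_fma4_kernel (void)  { puts ("avx128 + fma4"); }

static kernel_fn
pick_kernel (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
    return avx128_fma3_kernel;
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma4"))
    return avx128_fma4_kernel;
  return generic_kernel;
}

int
main (void)
{
  pick_kernel () ();
  return 0;
}
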
diff --git a/libgfortran/generated/matmulavx128_i8.c b/libgfortran/generated/matmulavx128_i8.c
new file mode 100644
index 00000000000..bc78ffe2779
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_i8.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_INTEGER_8)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
+ const int *, const GFC_INTEGER_8 *, const int *,
+ const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_i8_avx128_fma3);
+void
+matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_8 * restrict abase;
+ const GFC_INTEGER_8 * restrict bbase;
+ GFC_INTEGER_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+ /* bystride should never be used for 1-dimensional b.
+ The value is only used for calculation of the
+         memory needed by the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_8 *a, *b;
+ GFC_INTEGER_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_i8_avx128_fma4);
+void
+matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_INTEGER_8 * restrict abase;
+ const GFC_INTEGER_8 * restrict bbase;
+ GFC_INTEGER_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+ /* bystride should never be used for 1-dimensional b.
+ The value is only used for calculation of the
+         memory needed by the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_INTEGER_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_INTEGER_8 *a, *b;
+ GFC_INTEGER_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_INTEGER_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
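The rank-1 and non-unit-stride branches that close each generated file all reduce to the same strided triple loop shown just above. As a minimal sketch of that indexing scheme (plain C, illustrative names, and a double element type rather than the generated GFC_INTEGER_8 code):

/* Sketch only: C(x,y) = sum over n of A(x,n) * B(n,y), with every operand
   addressed through explicit strides exactly as in the fallback loops above.
   Names and the double element type are placeholders.  */
static void
matmul_strided_sketch (double *restrict dest, const double *restrict a,
                       const double *restrict b,
                       long xcount, long ycount, long count,
                       long rxstride, long rystride,
                       long axstride, long aystride,
                       long bxstride, long bystride)
{
  for (long y = 0; y < ycount; y++)
    for (long x = 0; x < xcount; x++)
      {
        double s = 0.0;
        for (long n = 0; n < count; n++)
          s += a[x * axstride + n * aystride] * b[n * bxstride + y * bystride];
        dest[x * rxstride + y * rystride] = s;
      }
}

The specialized branches above differ only in which strides are known to be 1, so the hot loads stay contiguous.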
diff --git a/libgfortran/generated/matmulavx128_r10.c b/libgfortran/generated/matmulavx128_r10.c
new file mode 100644
index 00000000000..943678d1b51
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_r10.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_REAL_10)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
+ const int *, const GFC_REAL_10 *, const int *,
+ const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r10_avx128_fma3);
+void
+matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_10 * restrict abase;
+ const GFC_REAL_10 * restrict bbase;
+ GFC_REAL_10 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used in the calculation of the
+         memory needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_10 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_10 *a, *b;
+ GFC_REAL_10 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_10 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_10)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
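The FMA3 variant above and the FMA4 variant below are the same C source; only the target attribute changes ("avx,fma" versus "avx,fma4"), so GCC emits different multiply-add instructions for each clone. A minimal standalone illustration of that pattern, assuming run-time selection through GCC's __builtin_cpu_supports (the real selection happens elsewhere in the library; this file only defines the clones):

/* Sketch only, not libgfortran code: compile one kernel twice with different
   target attributes and pick a clone at run time based on CPU features.  */
__attribute__ ((target ("avx,fma")))  static void kernel_fma3 (void) { /* ... FMA3 body ... */ }
__attribute__ ((target ("avx,fma4"))) static void kernel_fma4 (void) { /* ... FMA4 body ... */ }

static void
run_best_kernel (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
    kernel_fma3 ();
  else if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma4"))
    kernel_fma4 ();
  /* else: fall back to a generic kernel.  */
}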
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r10_avx128_fma4);
+void
+matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_10 * restrict abase;
+ const GFC_REAL_10 * restrict bbase;
+ GFC_REAL_10 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used in the calculation of the
+         memory needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_10 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_10 *a, *b;
+ GFC_REAL_10 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_10 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_10)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
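Within each of these functions, the unit-stride path packs a panel of A into the contiguous scratch buffer t1 and then updates C in 4x4 register blocks, following the Kagstrom/Ling superscalar GEMM scheme cited in the comments. A much-simplified sketch of that blocking idea (small illustrative block size, double elements, hypothetical names; neither the 256/512 blocking nor the 4x4 register tiling of the generated code):

#include <stdlib.h>

#define BLK 64  /* illustrative block size; the generated code blocks by 256/512 */

/* Sketch only: C = A*B for column-major matrices with leading dimensions
   m (for A and C) and k (for B).  Each block of A is packed into a contiguous
   scratch buffer so the innermost loop reads it with stride 1, mirroring the
   role of t1 above.  */
static void
matmul_blocked_sketch (double *c, const double *a, const double *b,
                       int m, int n, int k)
{
  double *t = malloc (sizeof (double) * BLK * BLK);
  if (t == NULL)
    return;

  /* Clear C first, as the generated code does.  */
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      c[i + j * m] = 0.0;

  for (int jj = 0; jj < n; jj += BLK)
    for (int ll = 0; ll < k; ll += BLK)
      for (int ii = 0; ii < m; ii += BLK)
        {
          int jb = n - jj < BLK ? n - jj : BLK;
          int lb = k - ll < BLK ? k - ll : BLK;
          int ib = m - ii < BLK ? m - ii : BLK;

          /* Pack the ib x lb block of A; l varies fastest, as in t1.  */
          for (int i = 0; i < ib; i++)
            for (int l = 0; l < lb; l++)
              t[i * BLK + l] = a[(ii + i) + (ll + l) * m];

          /* Accumulate this block's contribution into C.  */
          for (int j = 0; j < jb; j++)
            for (int i = 0; i < ib; i++)
              {
                double s = c[(ii + i) + (jj + j) * m];
                for (int l = 0; l < lb; l++)
                  s += t[i * BLK + l] * b[(ll + l) + (jj + j) * k];
                c[(ii + i) + (jj + j) * m] = s;
              }
        }

  free (t);
}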
diff --git a/libgfortran/generated/matmulavx128_r16.c b/libgfortran/generated/matmulavx128_r16.c
new file mode 100644
index 00000000000..3d5738bb6f5
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_r16.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_REAL_16)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
+ const int *, const GFC_REAL_16 *, const int *,
+ const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r16_avx128_fma3);
+void
+matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_16 * restrict abase;
+ const GFC_REAL_16 * restrict bbase;
+ GFC_REAL_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used in the calculation of the
+         memory needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_16 *a, *b;
+ GFC_REAL_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
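Before any of the inlined code runs, each variant first offers the work to an external BLAS: when try_blas is set, the strides permit it, and m*n*k exceeds blas_limit cubed (compared in float via the POW3 macro, presumably so huge extents cannot overflow an integer product), the gemm pointer passed in by the front end is called instead. A tiny sketch of that size test, with placeholder names:

/* Sketch only, not libgfortran API: is the problem big enough to hand off to
   the external ?gemm supplied by the front end?  */
static int
use_external_gemm (int try_blas, long m, long n, long k, int blas_limit)
{
  float size  = (float) m * (float) n * (float) k;
  float limit = (float) blas_limit * (float) blas_limit * (float) blas_limit;
  return try_blas && size > limit;
}

The generated code additionally requires positive leading dimensions and extents greater than 1 before taking the BLAS path.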
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r16_avx128_fma4);
+void
+matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_16 * restrict abase;
+ const GFC_REAL_16 * restrict bbase;
+ GFC_REAL_16 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used in the calculation of the
+         memory needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_16 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_16 *a, *b;
+ GFC_REAL_16 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_16 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_16)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
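
In the blocked kernel above, t1 is a packed panel of A with a fixed leading
dimension of 256 along the inner (l) direction, which is also why its size is
capped at 65536 = 256 * 256 elements.  The expression
t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] is an ordinary two-dimensional
access written out by hand: the << 8 multiplies the 1-based column index
(i - ii + 1) by that leading dimension, and the constant 257 folds the two
"- 1" corrections for the 1-based row and column indices into a single
subtraction.  The following stand-alone C sketch shows the same indexing;
the names T1_REF and PANEL_LD are assumptions introduced for this example
and are not part of libgfortran.

/* Illustrative sketch only, not libgfortran code: the packed-panel
   indexing used above, rewritten as a 2-D macro.  */
#include <assert.h>

#define PANEL_LD 256   /* fixed leading dimension of the packing buffer */

/* Element (row, col) of the packed panel, with 1-based block-local
   indices, addressed exactly as the generated code addresses t1.  */
#define T1_REF(t1, row, col) ((t1)[(row) + (col) * PANEL_LD - (PANEL_LD + 1)])

int
main (void)
{
  static float t1[PANEL_LD * PANEL_LD];
  int i = 7, ii = 5, l = 10, ll = 3;

  /* Both forms address the same element of the buffer.  */
  assert (&T1_REF (t1, l - ll + 1, i - ii + 1)
          == &t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]);
  return 0;
}
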
diff --git a/libgfortran/generated/matmulavx128_r4.c b/libgfortran/generated/matmulavx128_r4.c
new file mode 100644
index 00000000000..6c6da3994c9
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_r4.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_REAL_4)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
+ const int *, const GFC_REAL_4 *, const int *,
+ const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r4_avx128_fma3);
+void
+matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_4 * restrict abase;
+ const GFC_REAL_4 * restrict bbase;
+ GFC_REAL_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_4 *a, *b;
+ GFC_REAL_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r4_avx128_fma4);
+void
+matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_4 * restrict abase;
+ const GFC_REAL_4 * restrict bbase;
+ GFC_REAL_4 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_4 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_4 *a, *b;
+ GFC_REAL_4 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_4 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_4)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
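
The file above defines two separately compiled variants of the same kernel,
matmul_r4_avx128_fma3 (target("avx,fma")) and matmul_r4_avx128_fma4
(target("avx,fma4")); which of them actually runs is decided by the matmul
dispatch code, which is not part of this hunk.  The sketch below shows what
such a run-time selection can look like; it is an assumption made for this
example, not the dispatcher from matmul.m4, and the function pointer, the
selection routine and the fallback name matmul_r4_vanilla are invented
names.  The feature test uses GCC's __builtin_cpu_supports().

/* Minimal dispatch sketch (illustrative only): choose one of the AVX128
   kernels once, based on run-time CPU features.  */
static void (*matmul_r4_impl) (gfc_array_r4 * const restrict,
                               gfc_array_r4 * const restrict,
                               gfc_array_r4 * const restrict,
                               int, int, blas_call);

static void
select_matmul_r4 (void)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
    {
      matmul_r4_impl = matmul_r4_avx128_fma3;
      return;
    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma4"))
    {
      matmul_r4_impl = matmul_r4_avx128_fma4;
      return;
    }
#endif
  matmul_r4_impl = matmul_r4_vanilla;  /* assumed generic fallback */
}

Whichever kernel is selected, the external BLAS path inside it is only tried
when the strides permit it and xcount * ycount * count exceeds the cube of
blas_limit; with blas_limit = 30, for example, the gemm callback is
considered only for products that perform more than 27000 multiply-add
operations.
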
diff --git a/libgfortran/generated/matmulavx128_r8.c b/libgfortran/generated/matmulavx128_r8.c
new file mode 100644
index 00000000000..d628200e8e3
--- /dev/null
+++ b/libgfortran/generated/matmulavx128_r8.c
@@ -0,0 +1,1152 @@
+/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>
+
+
+/* These are the specific versions of matmul with -mprefer-avx128. */
+
+#if defined (HAVE_GFC_REAL_8)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
+ const int *, const GFC_REAL_8 *, const int *,
+ const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+void
+matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto(matmul_r8_avx128_fma3);
+void
+matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_8 * restrict abase;
+ const GFC_REAL_8 * restrict bbase;
+ GFC_REAL_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+      /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1] */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+      /* bystride should never be used for 1-dimensional b.
+         The value is only used when calculating the memory
+         needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_8 *a, *b;
+ GFC_REAL_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_8)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+void
+matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto(matmul_r8_avx128_fma4);
+void
+matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+{
+ const GFC_REAL_8 * restrict abase;
+ const GFC_REAL_8 * restrict bbase;
+ GFC_REAL_8 * restrict dest;
+
+ index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
+ index_type x, y, n, count, xcount, ycount;
+
+ assert (GFC_DESCRIPTOR_RANK (a) == 2
+ || GFC_DESCRIPTOR_RANK (b) == 2);
+
+/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
+
+ Either A or B (but not both) can be rank 1:
+
+ o One-dimensional argument A is implicitly treated as a row matrix
+ dimensioned [1,count], so xcount=1.
+
+ o One-dimensional argument B is implicitly treated as a column matrix
+ dimensioned [count, 1], so ycount=1.
+*/
+
+ if (retarray->base_addr == NULL)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+ }
+ else
+ {
+ GFC_DIMENSION_SET(retarray->dim[0], 0,
+ GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
+
+ GFC_DIMENSION_SET(retarray->dim[1], 0,
+ GFC_DESCRIPTOR_EXTENT(b,1) - 1,
+ GFC_DESCRIPTOR_EXTENT(retarray,0));
+ }
+
+ retarray->base_addr
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
+ retarray->offset = 0;
+ }
+ else if (unlikely (compile_options.bounds_check))
+ {
+ index_type ret_extent, arg_extent;
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic: is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ else
+ {
+ arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 1:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+
+ arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
+ ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
+ if (arg_extent != ret_extent)
+ runtime_error ("Incorrect extent in return array in"
+ " MATMUL intrinsic for dimension 2:"
+ " is %ld, should be %ld",
+ (long int) ret_extent, (long int) arg_extent);
+ }
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (retarray) == 1)
+ {
+ /* One-dimensional result may be addressed in the code below
+ either as a row or a column matrix. We want both cases to
+ work. */
+ rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ }
+ else
+ {
+ rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
+ rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
+ }
+
+
+ if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ /* Treat it as a row matrix A[1,count]. */
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = 1;
+
+ xcount = 1;
+ count = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+ else
+ {
+ axstride = GFC_DESCRIPTOR_STRIDE(a,0);
+ aystride = GFC_DESCRIPTOR_STRIDE(a,1);
+
+ count = GFC_DESCRIPTOR_EXTENT(a,1);
+ xcount = GFC_DESCRIPTOR_EXTENT(a,0);
+ }
+
+ if (count != GFC_DESCRIPTOR_EXTENT(b,0))
+ {
+ if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
+ runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
+ }
+
+ if (GFC_DESCRIPTOR_RANK (b) == 1)
+ {
+ /* Treat it as a column matrix B[count,1]. */
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+
+ /* bystride should never be used for 1-dimensional b.
+ The value is only used for the calculation of the
+ memory needed for the buffer. */
+ bystride = 256;
+ ycount = 1;
+ }
+ else
+ {
+ bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
+ bystride = GFC_DESCRIPTOR_STRIDE(b,1);
+ ycount = GFC_DESCRIPTOR_EXTENT(b,1);
+ }
+
+ abase = a->base_addr;
+ bbase = b->base_addr;
+ dest = retarray->base_addr;
+
+ /* Now that everything is set up, we perform the multiplication
+ itself. */
+
+#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
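+ /* If the caller passed a BLAS gemm (try_blas), the strides are compatible,
+ and the work xcount * ycount * count exceeds blas_limit cubed, hand the
+ multiplication off to the external gemm routine. */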
+ if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
+ && (bxstride == 1 || bystride == 1)
+ && (((float) xcount) * ((float) ycount) * ((float) count)
+ > POW3(blas_limit)))
+ {
+ const int m = xcount, n = ycount, k = count, ldc = rystride;
+ const GFC_REAL_8 one = 1, zero = 0;
+ const int lda = (axstride == 1) ? aystride : axstride,
+ ldb = (bxstride == 1) ? bystride : bxstride;
+
+ if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
+ {
+ assert (gemm != NULL);
+ gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
+ &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
+ &ldc, 1, 1);
+ return;
+ }
+ }
+
+ if (rxstride == 1 && axstride == 1 && bxstride == 1)
+ {
+ /* This block of code implements a tuned matmul, derived from
+ Superscalar GEMM-based level 3 BLAS, Beta version 0.1
+
+ Bo Kagstrom and Per Ling
+ Department of Computing Science
+ Umea University
+ S-901 87 Umea, Sweden
+
+ from netlib.org, translated to C, and modified for matmul.m4. */
+
+ const GFC_REAL_8 *a, *b;
+ GFC_REAL_8 *c;
+ const index_type m = xcount, n = ycount, k = count;
+
+ /* System generated locals */
+ index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
+ i1, i2, i3, i4, i5, i6;
+
+ /* Local variables */
+ GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ f13, f14, f23, f24, f33, f34, f43, f44;
+ index_type i, j, l, ii, jj, ll;
+ index_type isec, jsec, lsec, uisec, ujsec, ulsec;
+ GFC_REAL_8 *t1;
+
+ a = abase;
+ b = bbase;
+ c = retarray->base_addr;
+
+ /* Parameter adjustments */
+ c_dim1 = rystride;
+ c_offset = 1 + c_dim1;
+ c -= c_offset;
+ a_dim1 = aystride;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ b_dim1 = bystride;
+ b_offset = 1 + b_dim1;
+ b -= b_offset;
+
+ /* Early exit if possible */
+ if (m == 0 || n == 0 || k == 0)
+ return;
+
+ /* Adjust size of t1 to what is needed. */
+ index_type t1_dim;
+ t1_dim = (a_dim1-1) * 256 + b_dim1;
+ if (t1_dim > 65536)
+ t1_dim = 65536;
+
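+ /* t1 caches a block of A, transposed so that the innermost loops below read
+ it with unit stride; element (l - ll, i - ii) of the block is stored at
+ offset (l - ll) + 256 * (i - ii). */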
+ t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
+
+ /* Empty c first. */
+ for (j=1; j<=n; j++)
+ for (i=1; i<=m; i++)
+ c[i + j * c_dim1] = (GFC_REAL_8)0;
+
+ /* Start turning the crank. */
+ i1 = n;
+ for (jj = 1; jj <= i1; jj += 512)
+ {
+ /* Computing MIN */
+ i2 = 512;
+ i3 = n - jj + 1;
+ jsec = min(i2,i3);
+ ujsec = jsec - jsec % 4;
+ i2 = k;
+ for (ll = 1; ll <= i2; ll += 256)
+ {
+ /* Computing MIN */
+ i3 = 256;
+ i4 = k - ll + 1;
+ lsec = min(i3,i4);
+ ulsec = lsec - lsec % 2;
+
+ i3 = m;
+ for (ii = 1; ii <= i3; ii += 256)
+ {
+ /* Computing MIN */
+ i4 = 256;
+ i5 = m - ii + 1;
+ isec = min(i4,i5);
+ uisec = isec - isec % 2;
+ i4 = ll + ulsec - 1;
+ for (l = ll; l <= i4; l += 2)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 2)
+ {
+ t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
+ a[i + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
+ a[i + (l + 1) * a_dim1];
+ t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + l * a_dim1];
+ t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
+ a[i + 1 + (l + 1) * a_dim1];
+ }
+ if (uisec < isec)
+ {
+ t1[l - ll + 1 + (isec << 8) - 257] =
+ a[ii + isec - 1 + l * a_dim1];
+ t1[l - ll + 2 + (isec << 8) - 257] =
+ a[ii + isec - 1 + (l + 1) * a_dim1];
+ }
+ }
+ if (ulsec < lsec)
+ {
+ i4 = ii + isec - 1;
+ for (i = ii; i<= i4; ++i)
+ {
+ t1[lsec + ((i - ii + 1) << 8) - 257] =
+ a[i + (ll + lsec - 1) * a_dim1];
+ }
+ }
+
+ uisec = isec - isec % 4;
+ i4 = jj + ujsec - 1;
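+ /* Multiply the cached block of A by the matching block of B, accumulating
+ into 4x4 register tiles of C (f11 .. f44). */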
+ for (j = jj; j <= i4; j += 4)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f22 = c[i + 1 + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f23 = c[i + 1 + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ f24 = c[i + 1 + (j + 3) * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ f32 = c[i + 2 + (j + 1) * c_dim1];
+ f42 = c[i + 3 + (j + 1) * c_dim1];
+ f33 = c[i + 2 + (j + 2) * c_dim1];
+ f43 = c[i + 3 + (j + 2) * c_dim1];
+ f34 = c[i + 2 + (j + 3) * c_dim1];
+ f44 = c[i + 3 + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + j * b_dim1];
+ f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 1) * b_dim1];
+ f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 2) * b_dim1];
+ f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
+ * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + 1 + (j + 1) * c_dim1] = f22;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + 1 + (j + 2) * c_dim1] = f23;
+ c[i + (j + 3) * c_dim1] = f14;
+ c[i + 1 + (j + 3) * c_dim1] = f24;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ c[i + 2 + (j + 1) * c_dim1] = f32;
+ c[i + 3 + (j + 1) * c_dim1] = f42;
+ c[i + 2 + (j + 2) * c_dim1] = f33;
+ c[i + 3 + (j + 2) * c_dim1] = f43;
+ c[i + 2 + (j + 3) * c_dim1] = f34;
+ c[i + 3 + (j + 3) * c_dim1] = f44;
+ }
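+ /* Remaining rows of the current block when isec is not a multiple of 4. */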
+ if (uisec < isec)
+ {
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ f12 = c[i + (j + 1) * c_dim1];
+ f13 = c[i + (j + 2) * c_dim1];
+ f14 = c[i + (j + 3) * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 1) * b_dim1];
+ f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 2) * b_dim1];
+ f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + (j + 3) * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + (j + 1) * c_dim1] = f12;
+ c[i + (j + 2) * c_dim1] = f13;
+ c[i + (j + 3) * c_dim1] = f14;
+ }
+ }
+ }
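+ /* Remaining columns of the current block when jsec is not a multiple of 4. */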
+ if (ujsec < jsec)
+ {
+ i4 = jj + jsec - 1;
+ for (j = jj + ujsec; j <= i4; ++j)
+ {
+ i5 = ii + uisec - 1;
+ for (i = ii; i <= i5; i += 4)
+ {
+ f11 = c[i + j * c_dim1];
+ f21 = c[i + 1 + j * c_dim1];
+ f31 = c[i + 2 + j * c_dim1];
+ f41 = c[i + 3 + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
+ 257] * b[l + j * b_dim1];
+ f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
+ 257] * b[l + j * b_dim1];
+ f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ c[i + 1 + j * c_dim1] = f21;
+ c[i + 2 + j * c_dim1] = f31;
+ c[i + 3 + j * c_dim1] = f41;
+ }
+ i5 = ii + isec - 1;
+ for (i = ii + uisec; i <= i5; ++i)
+ {
+ f11 = c[i + j * c_dim1];
+ i6 = ll + lsec - 1;
+ for (l = ll; l <= i6; ++l)
+ {
+ f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
+ 257] * b[l + j * b_dim1];
+ }
+ c[i + j * c_dim1] = f11;
+ }
+ }
+ }
+ }
+ }
+ }
+ free(t1);
+ return;
+ }
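+ /* Strides are such that each element of the result can be computed as a
+ plain dot product along the summation index n. */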
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ if (GFC_DESCRIPTOR_RANK (a) != 1)
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else
+ {
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n];
+ dest[y*rystride] = s;
+ }
+ }
+ }
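+ /* The first dimension of A has the smaller stride, so clear dest first and
+ keep the x loop innermost for better memory locality. */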
+ else if (axstride < aystride)
+ {
+ for (y = 0; y < ycount; y++)
+ for (x = 0; x < xcount; x++)
+ dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
+
+ for (y = 0; y < ycount; y++)
+ for (n = 0; n < count; n++)
+ for (x = 0; x < xcount; x++)
+ /* dest[x,y] += a[x,n] * b[n,y] */
+ dest[x*rxstride + y*rystride] +=
+ abase[x*axstride + n*aystride] *
+ bbase[n*bxstride + y*bystride];
+ }
+ else if (GFC_DESCRIPTOR_RANK (a) == 1)
+ {
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase[n*axstride] * bbase_y[n*bxstride];
+ dest[y*rxstride] = s;
+ }
+ }
+ else
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
+}
+#undef POW3
+#undef min
+#undef max
+
+#endif
+
+#endif
+
diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4
index 7976fda8bb4..c2f641542bf 100644
--- a/libgfortran/m4/matmul.m4
+++ b/libgfortran/m4/matmul.m4
@@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
+/* AMD-specific functions with AVX128 and FMA3/FMA4. */
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
+`void
+'matmul_name` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto('matmul_name`);
+#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
+`void
+'matmul_name` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto('matmul_name`);
+#endif
+
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
@@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
}
#endif /* HAVE_AVX */
}
+ else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
+ {
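+ /* On AMD chips the 128-bit AVX kernels, compiled with -mprefer-avx128 and
+ using FMA3 or FMA4, are selected; they tend to perform better there than
+ the 256-bit code paths. */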
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+ {
+ matmul_fn = matmul_'rtype_code`_avx128_fma3;
+ goto store;
+ }
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
+ {
+ matmul_fn = matmul_'rtype_code`_avx128_fma4;
+ goto store;
+ }
+#endif
+
+ }
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
diff --git a/libgfortran/m4/matmulavx128.m4 b/libgfortran/m4/matmulavx128.m4
new file mode 100644
index 00000000000..14172843579
--- /dev/null
+++ b/libgfortran/m4/matmulavx128.m4
@@ -0,0 +1,67 @@
+`/* Implementation of the MATMUL intrinsic
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "libgfortran.h"
+#include <string.h>
+#include <assert.h>'
+
+include(iparm.m4)dnl
+
+/* These are the processor-specific versions of matmul compiled with -mprefer-avx128. */
+
+`#if defined (HAVE_'rtype_name`)
+
+/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
+ passed to us by the front-end, in which case we call it for large
+ matrices. */
+
+typedef void (*blas_call)(const char *, const char *, const int *, const int *,
+ const int *, const 'rtype_name` *, const 'rtype_name` *,
+ const int *, const 'rtype_name` *, const int *,
+ const 'rtype_name` *, 'rtype_name` *, const int *,
+ int, int);
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
+'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
+`void
+'matmul_name` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
+internal_proto('matmul_name`);
+'include(matmul_internal.m4)dnl
+`#endif
+
+#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
+'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
+`void
+'matmul_name` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
+internal_proto('matmul_name`);
+'include(matmul_internal.m4)dnl
+`#endif
+
+#endif
+'