summaryrefslogtreecommitdiff
path: root/libhsail-rt/rt
diff options
context:
space:
mode:
authorPekka Jääskeläinen <pekka@parmance.com>2017-01-24 12:45:56 +0000
committerMartin Jambor <jamborm@gcc.gnu.org>2017-01-24 13:45:56 +0100
commit5fd1486ce58297190c2b924e96e716087139a8b5 (patch)
tree80abae8778b2f25cc8bf5960402f20f16e4e7a8c /libhsail-rt/rt
parente1e41b6f10c76dbdc8bfd2d4a345dffefd45968f (diff)
Brig front-end
2017-01-24 Pekka Jääskeläinen <pekka@parmance.com> Martin Jambor <mjambor@suse.cz> * Makefile.def (target_modules): Added libhsail-rt. (languages): Added language brig. * Makefile.in: Regenerated. * configure.ac (TOPLEVEL_CONFIGURE_ARGUMENTS): Added tgarget-libhsail-rt. Make brig unsupported on untested architectures. * configure: Regenerated. gcc/ * brig-builtins.def: New file. * builtins.def (DEF_HSAIL_BUILTIN): New macro. (DEF_HSAIL_ATOMIC_BUILTIN): Likewise. (DEF_HSAIL_SAT_BUILTIN): Likewise. (DEF_HSAIL_INTR_BUILTIN): Likewise. (DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN): Likewise. * builtin-types.def (BT_INT8): New. (BT_INT16): Likewise. (BT_UINT8): Likewise. (BT_UINT16): Likewise. (BT_FN_ULONG): Likewise. (BT_FN_UINT_INT): Likewise. (BT_FN_UINT_ULONG): Likewise. (BT_FN_UINT_LONG): Likewise. (BT_FN_UINT_PTR): Likewise. (BT_FN_ULONG_PTR): Likewise. (BT_FN_INT8_FLOAT): Likewise. (BT_FN_INT16_FLOAT): Likewise. (BT_FN_UINT32_FLOAT): Likewise. (BT_FN_UINT16_FLOAT): Likewise. (BT_FN_UINT8_FLOAT): Likewise. (BT_FN_UINT64_FLOAT): Likewise. (BT_FN_UINT16_UINT32): Likewise. (BT_FN_UINT32_UINT16): Likewise. (BT_FN_UINT16_UINT16_UINT16): Likewise. (BT_FN_INT_PTR_INT): Likewise. (BT_FN_UINT_PTR_UINT): Likewise. (BT_FN_LONG_PTR_LONG): Likewise. (BT_FN_ULONG_PTR_ULONG): Likewise. (BT_FN_VOID_UINT64_UINT64): Likewise. (BT_FN_UINT8_UINT8_UINT8): Likewise. (BT_FN_INT8_INT8_INT8): Likewise. (BT_FN_INT16_INT16_INT16): Likewise. (BT_FN_INT_INT_INT): Likewise. (BT_FN_UINT_FLOAT_UINT): Likewise. (BT_FN_FLOAT_UINT_UINT): Likewise. (BT_FN_ULONG_UINT_UINT): Likewise. (BT_FN_ULONG_UINT_PTR): Likewise. (BT_FN_ULONG_ULONG_ULONG): Likewise. (BT_FN_UINT_UINT_UINT): Likewise. (BT_FN_VOID_UINT_PTR): Likewise. (BT_FN_UINT_UINT_PTR: Likewise. (BT_FN_UINT32_UINT64_PTR): Likewise. (BT_FN_INT_INT_UINT_UINT): Likewise. (BT_FN_UINT_UINT_UINT_UINT): Likewise. (BT_FN_UINT_UINT_UINT_PTR): Likewise. (BT_FN_UINT_ULONG_ULONG_UINT): Likewise. (BT_FN_ULONG_ULONG_ULONG_ULONG): Likewise. (BT_FN_LONG_LONG_UINT_UINT): Likewise. (BT_FN_ULONG_ULONG_UINT_UINT): Likewise. (BT_FN_VOID_UINT32_UINT64_PTR): Likewise. (BT_FN_VOID_UINT32_UINT32_PTR): Likewise. (BT_FN_UINT_UINT_UINT_UINT_UINT): Likewise. (BT_FN_UINT_FLOAT_FLOAT_FLOAT_FLOAT): Likewise. (BT_FN_ULONG_ULONG_ULONG_UINT_UINT): Likewise. * doc/frontends.texi: List BRIG FE. * doc/install.texi (Testing): Add BRIG tesring requirements. * doc/invoke.texi (Overall Options): Mention BRIG. * doc/standards.texi (Standards): Doucment BRIG HSA version. gcc/brig/ * Make-lang.in: New file. * brig-builtins.h: Likewise. * brig-c.h: Likewise. * brig-lang.c: Likewise. * brigspec.c: Likewise. * config-lang.in: Likewise. * lang-specs.h: Likewise. * lang.opt: Likewise. * brigfrontend/brig-arg-block-handler.cc: Likewise. * brigfrontend/brig-atomic-inst-handler.cc: Likewise. * brigfrontend/brig-basic-inst-handler.cc: Likewise. * brigfrontend/brig-branch-inst-handler.cc: Likewise. * brigfrontend/brig-cmp-inst-handler.cc: Likewise. * brigfrontend/brig-code-entry-handler.cc: Likewise. * brigfrontend/brig-code-entry-handler.h: Likewise. * brigfrontend/brig-comment-handler.cc: Likewise. * brigfrontend/brig-control-handler.cc: Likewise. * brigfrontend/brig-copy-move-inst-handler.cc: Likewise. * brigfrontend/brig-cvt-inst-handler.cc: Likewise. * brigfrontend/brig-fbarrier-handler.cc: Likewise. * brigfrontend/brig-function-handler.cc: Likewise. * brigfrontend/brig-function.cc: Likewise. * brigfrontend/brig-function.h: Likewise. * brigfrontend/brig-inst-mod-handler.cc: Likewise. * brigfrontend/brig-label-handler.cc: Likewise. * brigfrontend/brig-lane-inst-handler.cc: Likewise. * brigfrontend/brig-machine.c: Likewise. * brigfrontend/brig-machine.h: Likewise. * brigfrontend/brig-mem-inst-handler.cc: Likewise. * brigfrontend/brig-module-handler.cc: Likewise. * brigfrontend/brig-queue-inst-handler.cc: Likewise. * brigfrontend/brig-seg-inst-handler.cc: Likewise. * brigfrontend/brig-signal-inst-handler.cc: Likewise. * brigfrontend/brig-to-generic.cc: Likewise. * brigfrontend/brig-to-generic.h: Likewise. * brigfrontend/brig-util.cc: Likewise. * brigfrontend/brig-util.h: Likewise. * brigfrontend/brig-variable-handler.cc: Likewise. * brigfrontend/phsa.h: Likewise. gcc/testsuite/ * lib/brig-dg.exp: New file. * lib/brig.exp: Likewise. * brig.dg/README: Likewise. * brig.dg/dg.exp: Likewise. * brig.dg/test/gimple/alloca.hsail: Likewise. * brig.dg/test/gimple/atomics.hsail: Likewise. * brig.dg/test/gimple/branches.hsail: Likewise. * brig.dg/test/gimple/fbarrier.hsail: Likewise. * brig.dg/test/gimple/function_calls.hsail: Likewise. * brig.dg/test/gimple/kernarg.hsail: Likewise. * brig.dg/test/gimple/mem.hsail: Likewise. * brig.dg/test/gimple/mulhi.hsail: Likewise. * brig.dg/test/gimple/packed.hsail: Likewise. * brig.dg/test/gimple/smoke_test.hsail: Likewise. * brig.dg/test/gimple/variables.hsail: Likewise. * brig.dg/test/gimple/vector.hsail: Likewise. include/ * hsa.h: Moved here from libgomp/plugin/hsa.h. libgomp/ * plugin/hsa.h: Moved to top level include. * plugin/plugin-hsa.c: Chanfgd include of hsa.h accordingly. libhsail-rt/ * Makefile.am: New file. * target-config.h.in: Likewise. * configure.ac: Likewise. * configure: Likewise. * config.h.in: Likewise. * aclocal.m4: Likewise. * README: Likewise. * Makefile.in: Likewise. * include/internal/fibers.h: Likewise. * include/internal/phsa-queue-interface.h: Likewise. * include/internal/phsa-rt.h: Likewise. * include/internal/workitems.h: Likewise. * rt/arithmetic.c: Likewise. * rt/atomics.c: Likewise. * rt/bitstring.c: Likewise. * rt/fbarrier.c: Likewise. * rt/fibers.c: Likewise. * rt/fp16.c: Likewise. * rt/misc.c: Likewise. * rt/multimedia.c: Likewise. * rt/queue.c: Likewise. * rt/sat_arithmetic.c: Likewise. * rt/segment.c: Likewise. * rt/workitems.c: Likewise. Co-Authored-By: Martin Jambor <mjambor@suse.cz> From-SVN: r244867
Diffstat (limited to 'libhsail-rt/rt')
-rw-r--r--libhsail-rt/rt/arithmetic.c475
-rw-r--r--libhsail-rt/rt/atomics.c115
-rw-r--r--libhsail-rt/rt/bitstring.c190
-rw-r--r--libhsail-rt/rt/fbarrier.c87
-rw-r--r--libhsail-rt/rt/fibers.c220
-rw-r--r--libhsail-rt/rt/fp16.c135
-rw-r--r--libhsail-rt/rt/misc.c89
-rw-r--r--libhsail-rt/rt/multimedia.c135
-rw-r--r--libhsail-rt/rt/queue.c71
-rw-r--r--libhsail-rt/rt/sat_arithmetic.c299
-rw-r--r--libhsail-rt/rt/segment.c57
-rw-r--r--libhsail-rt/rt/workitems.c952
12 files changed, 2825 insertions, 0 deletions
diff --git a/libhsail-rt/rt/arithmetic.c b/libhsail-rt/rt/arithmetic.c
new file mode 100644
index 00000000000..6749752e6e9
--- /dev/null
+++ b/libhsail-rt/rt/arithmetic.c
@@ -0,0 +1,475 @@
+/* arithmetic.c -- Builtins for HSAIL arithmetic instructions for which
+ there is no feasible direct gcc GENERIC expression.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdint.h>
+#include <limits.h>
+#include <math.h>
+#include <float.h>
+
+/* HSAIL defines INT_MIN % -1 to be 0 while with C it's undefined,
+ and causes an overflow exception at least with gcc and C on IA-32. */
+
+int32_t
+__hsail_rem_s32 (int32_t dividend, int32_t divisor)
+{
+ if (dividend == INT_MIN && divisor == -1)
+ return 0;
+ else
+ return dividend % divisor;
+}
+
+int64_t
+__hsail_rem_s64 (int64_t dividend, int64_t divisor)
+{
+ if (dividend == INT64_MIN && divisor == -1)
+ return 0;
+ else
+ return dividend % divisor;
+}
+
+/* HSAIL has defined behavior for min and max when one of the operands is
+ NaN: in that case the other operand is returned. In C and with gcc's
+ MIN_EXPR/MAX_EXPR, the returned operand is undefined. */
+
+float
+__hsail_min_f32 (float a, float b)
+{
+ if (isnan (a))
+ return b;
+ else if (isnan (b))
+ return a;
+ else if (a == 0.0f && b == 0.0f)
+ return signbit (a) ? a : b;
+ else if (a > b)
+ return b;
+ else
+ return a;
+}
+
+double
+__hsail_min_f64 (double a, double b)
+{
+ if (isnan (a))
+ return b;
+ else if (isnan (b))
+ return a;
+ else if (a > b)
+ return b;
+ else
+ return a;
+}
+
+float
+__hsail_max_f32 (float a, float b)
+{
+ if (isnan (a))
+ return b;
+ else if (isnan (b))
+ return a;
+ else if (a == 0.0f && b == 0.0f && signbit (a))
+ return b;
+ else if (a < b)
+ return b;
+ else
+ return a;
+}
+
+double
+__hsail_max_f64 (double a, double b)
+{
+ if (isnan (a))
+ return b;
+ else if (isnan (b))
+ return a;
+ else if (a == 0.0 && b == 0.0 && signbit (a))
+ return b;
+ else if (a < b)
+ return b;
+ else
+ return a;
+}
+
+uint8_t
+__hsail_cvt_zeroi_sat_u8_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) UINT8_MAX)
+ return UINT8_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint8_t) a;
+}
+
+int8_t
+__hsail_cvt_zeroi_sat_s8_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) INT8_MAX)
+ return INT8_MAX;
+ if (a <= (float) INT8_MIN)
+ return INT8_MIN;
+ return (int8_t) a;
+}
+
+uint16_t
+__hsail_cvt_zeroi_sat_u16_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) UINT16_MAX)
+ return UINT16_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint16_t) a;
+}
+
+int16_t
+__hsail_cvt_zeroi_sat_s16_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) INT16_MAX)
+ return INT16_MAX;
+ if (a <= (float) INT16_MIN)
+ return INT16_MIN;
+ return (int16_t) a;
+}
+
+uint32_t
+__hsail_cvt_zeroi_sat_u32_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) UINT32_MAX)
+ return UINT32_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint32_t) a;
+}
+
+int32_t
+__hsail_cvt_zeroi_sat_s32_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) INT32_MAX)
+ return INT32_MAX;
+ if (a <= (float) INT32_MIN)
+ return INT32_MIN;
+ return (int32_t) a;
+}
+
+uint64_t
+__hsail_cvt_zeroi_sat_u64_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) UINT64_MAX)
+ return UINT64_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint64_t) a;
+}
+
+int64_t
+__hsail_cvt_zeroi_sat_s64_f32 (float a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (float) INT64_MAX)
+ return INT64_MAX;
+ if (a <= (float) INT64_MIN)
+ return INT64_MIN;
+ return (int64_t) a;
+}
+
+uint8_t
+__hsail_cvt_zeroi_sat_u8_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) UINT8_MAX)
+ return UINT8_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint8_t) a;
+}
+
+int8_t
+__hsail_cvt_zeroi_sat_s8_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) INT8_MAX)
+ return INT8_MAX;
+ if (a <= (double) INT8_MIN)
+ return INT8_MIN;
+ return (int8_t) a;
+}
+
+uint16_t
+__hsail_cvt_zeroi_sat_u16_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) UINT16_MAX)
+ return UINT16_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint16_t) a;
+}
+
+int16_t
+__hsail_cvt_zeroi_sat_s16_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) INT16_MAX)
+ return INT16_MAX;
+ if (a <= (double) INT16_MIN)
+ return INT16_MIN;
+ return (int16_t) a;
+}
+
+uint32_t
+__hsail_cvt_zeroi_sat_u32_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) UINT32_MAX)
+ return UINT32_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint32_t) a;
+}
+
+int32_t
+__hsail_cvt_zeroi_sat_s32_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) INT32_MAX)
+ return INT32_MAX;
+ if (a <= (double) INT32_MIN)
+ return INT32_MIN;
+ return (int32_t) a;
+}
+
+uint64_t
+__hsail_cvt_zeroi_sat_u64_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) UINT64_MAX)
+ return UINT64_MAX;
+ else if (a <= 0.0f)
+ return 0;
+ return (uint64_t) a;
+}
+
+int64_t
+__hsail_cvt_zeroi_sat_s64_f64 (double a)
+{
+ if (isnan (a))
+ return 0;
+ if (a >= (double) INT64_MAX)
+ return INT64_MAX;
+ if (a <= (double) INT64_MIN)
+ return INT64_MIN;
+ return (int64_t) a;
+}
+
+
+/* Flush the operand to zero in case it's a denormalized number.
+ Do not cause any exceptions in case of NaNs. */
+
+float
+__hsail_ftz_f32 (float a)
+{
+ if (isnan (a) || isinf (a) || a == 0.0f)
+ return a;
+
+ if (a < 0.0f)
+ {
+ if (-a < FLT_MIN)
+ return -0.0f;
+ }
+ else
+ {
+ if (a < FLT_MIN)
+ return 0.0f;
+ }
+ return a;
+}
+
+#define F16_MIN (6.10e-5)
+
+/* Flush the single precision operand to zero in case it's considered
+ a denormalized number in case it was a f16. Do not cause any exceptions
+ in case of NaNs. */
+
+float
+__hsail_ftz_f32_f16 (float a)
+{
+ if (isnan (a) || isinf (a) || a == 0.0f)
+ return a;
+
+ if (a < 0.0f)
+ {
+ if (-a < F16_MIN)
+ return -0.0f;
+ }
+ else
+ {
+ if (a < F16_MIN)
+ return 0.0f;
+ }
+ return a;
+}
+
+double
+__hsail_ftz_f64 (double a)
+{
+ if (isnan (a) || isinf (a) || a == 0.0d)
+ return a;
+
+ if (a < 0.0d)
+ {
+ if (-a < DBL_MIN)
+ return -0.0d;
+ }
+ else
+ {
+ if (a < DBL_MIN)
+ return 0.0d;
+ }
+ return a;
+}
+
+uint32_t
+__hsail_borrow_u32 (uint32_t a, uint32_t b)
+{
+ uint64_t c = (uint64_t) a - (uint64_t) b;
+ if (c > UINT32_MAX)
+ return 1;
+ else
+ return 0;
+}
+
+uint64_t
+__hsail_borrow_u64 (uint64_t a, uint64_t b)
+{
+ __uint128_t c = (__uint128_t) a - (__uint128_t) b;
+ if (c > UINT64_MAX)
+ return 1;
+ else
+ return 0;
+}
+
+uint32_t
+__hsail_carry_u32 (uint32_t a, uint32_t b)
+{
+ uint64_t c = (uint64_t) a + (uint64_t) b;
+ if (c > UINT32_MAX)
+ return 1;
+ else
+ return 0;
+}
+
+uint64_t
+__hsail_carry_u64 (uint64_t a, uint64_t b)
+{
+ __uint128_t c = (__uint128_t) a + (__uint128_t) b;
+ if (c > UINT64_MAX)
+ return 1;
+ else
+ return 0;
+}
+
+float
+__hsail_fract_f32 (float a)
+{
+ int exp;
+ if (isinf (a))
+ return signbit (a) == 0 ? 0.0f : -0.0f;
+ if (isnan (a) || a == 0.0f)
+ return a;
+ else
+ return fminf (a - floorf (a), 0x1.fffffep-1f);
+}
+
+double
+__hsail_fract_f64 (double a)
+{
+ int exp;
+ if (isinf (a))
+ return 0.0f * isinf (a);
+ if (isnan (a) || a == 0.0f)
+ return a;
+ else
+ return fmin (a - floor (a), 0x1.fffffffffffffp-1d);
+}
+
+uint32_t
+__hsail_class_f32 (float a, uint32_t flags)
+{
+ return (flags & 0x0001 && isnan (a) && !(*(uint32_t *) &a & 0x40000000))
+ || (flags & 0x0002 && isnan (a) && (*(uint32_t *) &a & 0x40000000))
+ || (flags & 0x0004 && isinf (a) && a < 0.0f)
+ || (flags & 0x0008 && isnormal (a) && signbit (a))
+ || (flags & 0x0010 && a < 0.0f && a > -FLT_MIN)
+ || (flags & 0x0020 && a == 0.0f && signbit (a))
+ || (flags & 0x0040 && a == 0.0f && !signbit (a))
+ || (flags & 0x0080 && a > 0.0f && a < FLT_MIN)
+ || (flags & 0x0100 && isnormal (a) && !signbit (a))
+ || (flags & 0x0200 && isinf (a) && a >= 0.0f);
+}
+
+/* 'class' for a f32-converted f16 which should otherwise be treated like f32
+ except for its limits. */
+
+uint32_t
+__hsail_class_f32_f16 (float a, uint32_t flags)
+{
+ return (flags & 0x0001 && isnan (a) && !(*(uint32_t *) &a & 0x40000000))
+ || (flags & 0x0002 && isnan (a) && (*(uint32_t *) &a & 0x40000000))
+ || (flags & 0x0004 && isinf (a) && a < 0.0f)
+ || (flags & 0x0008 && a != 0.0f && !isinf (a) && !isnan (a)
+ && a <= -F16_MIN)
+ || (flags & 0x0010 && a != 0.0f && !isinf (a) && !isnan (a) && a < 0.0f
+ && a > -F16_MIN)
+ || (flags & 0x0020 && a == 0.0f && signbit (a))
+ || (flags & 0x0040 && a == 0.0f && !signbit (a))
+ || (flags & 0x0080 && a != 0.0f && !isinf (a) && !isnan (a) && a > 0.0f
+ && a < F16_MIN)
+ || (flags & 0x0100 && a != 0.0f && !isinf (a) && !isnan (a)
+ && a >= F16_MIN)
+ || (flags & 0x0200 && isinf (a) && a >= 0.0f);
+}
diff --git a/libhsail-rt/rt/atomics.c b/libhsail-rt/rt/atomics.c
new file mode 100644
index 00000000000..04f02f06aff
--- /dev/null
+++ b/libhsail-rt/rt/atomics.c
@@ -0,0 +1,115 @@
+/* atomic.c -- Builtins for HSAIL atomic instructions for which
+ there is no feasible direct gcc GENERIC expression.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define DO_ATOMICALLY(T, OPERATION) \
+ int done = 0; \
+ T old_value; \
+ T new_value; \
+ while (!done) \
+ { \
+ old_value = *ptr; \
+ new_value = OPERATION; \
+ done = __sync_bool_compare_and_swap (ptr, old_value, new_value); \
+ } \
+ return old_value
+
+int32_t
+__hsail_atomic_min_s32 (int32_t *ptr, int32_t a)
+{
+ DO_ATOMICALLY (int32_t, (old_value < a) ? old_value : a);
+}
+
+int64_t
+__hsail_atomic_min_s64 (int64_t *ptr, int64_t a)
+{
+ DO_ATOMICALLY (int64_t, (old_value < a) ? old_value : a);
+}
+
+uint32_t
+__hsail_atomic_min_u32 (uint32_t *ptr, uint32_t a)
+{
+ DO_ATOMICALLY (uint32_t, (old_value < a) ? old_value : a);
+}
+
+uint64_t
+__hsail_atomic_min_u64 (uint64_t *ptr, uint64_t a)
+{
+ DO_ATOMICALLY (uint64_t, (old_value < a) ? old_value : a);
+}
+
+uint32_t
+__hsail_atomic_max_u32 (uint32_t *ptr, uint32_t a)
+{
+ DO_ATOMICALLY (uint32_t, (old_value > a) ? old_value : a);
+}
+
+int32_t
+__hsail_atomic_max_s32 (int32_t *ptr, int32_t a)
+{
+ DO_ATOMICALLY (int32_t, (old_value > a) ? old_value : a);
+}
+
+uint64_t
+__hsail_atomic_max_u64 (uint64_t *ptr, uint64_t a)
+{
+ DO_ATOMICALLY (uint64_t, (old_value > a) ? old_value : a);
+}
+
+int64_t
+__hsail_atomic_max_s64 (int64_t *ptr, int64_t a)
+{
+ DO_ATOMICALLY (int64_t, (old_value > a) ? old_value : a);
+}
+
+uint32_t
+__hsail_atomic_wrapinc_u32 (uint32_t *ptr, uint32_t a)
+{
+ DO_ATOMICALLY (uint32_t, (old_value >= a) ? 0 : (old_value + 1));
+}
+
+uint64_t
+__hsail_atomic_wrapinc_u64 (uint64_t *ptr, uint64_t a)
+{
+ DO_ATOMICALLY (uint64_t, (old_value >= a) ? 0 : (old_value + 1));
+}
+
+uint32_t
+__hsail_atomic_wrapdec_u32 (uint32_t *ptr, uint32_t a)
+{
+ DO_ATOMICALLY (uint32_t,
+ ((old_value == 0) || (old_value > a)) ? a : (old_value - 1));
+}
+
+uint64_t
+__hsail_atomic_wrapdec_u64 (uint64_t *ptr, uint64_t a)
+{
+ DO_ATOMICALLY (uint64_t,
+ ((old_value == 0) || (old_value > a)) ? a : (old_value - 1));
+}
diff --git a/libhsail-rt/rt/bitstring.c b/libhsail-rt/rt/bitstring.c
new file mode 100644
index 00000000000..44d9b0a776b
--- /dev/null
+++ b/libhsail-rt/rt/bitstring.c
@@ -0,0 +1,190 @@
+/* bitstring.c -- Builtins for HSAIL bitstring instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdint.h>
+#include <limits.h>
+
+#define BITEXTRACT(DEST_TYPE, SRC0, SRC1, SRC2) \
+ uint32_t offset = SRC1 & (sizeof (DEST_TYPE) * 8 - 1); \
+ uint32_t width = SRC2 & (sizeof (DEST_TYPE) * 8 - 1); \
+ if (width == 0) \
+ return 0; \
+ else \
+ return (SRC0 << (sizeof (DEST_TYPE) * 8 - width - offset)) \
+ >> (sizeof (DEST_TYPE) * 8 - width)
+
+uint32_t
+__hsail_bitextract_u32 (uint32_t src0, uint32_t src1, uint32_t src2)
+{
+ BITEXTRACT (uint32_t, src0, src1, src2);
+}
+
+int32_t
+__hsail_bitextract_s32 (int32_t src0, uint32_t src1, uint32_t src2)
+{
+ BITEXTRACT (int32_t, src0, src1, src2);
+}
+
+uint64_t
+__hsail_bitextract_u64 (uint64_t src0, uint32_t src1, uint32_t src2)
+{
+ BITEXTRACT (uint64_t, src0, src1, src2);
+}
+
+int64_t
+__hsail_bitextract_s64 (int64_t src0, uint32_t src1, uint32_t src2)
+{
+ BITEXTRACT (int64_t, src0, src1, src2);
+}
+
+#define BITINSERT(DEST_TYPE, SRC0, SRC1, SRC2, SRC3) \
+ uint32_t offset = SRC2 & (sizeof (DEST_TYPE) * 8 - 1); \
+ uint32_t width = SRC3 & (sizeof (DEST_TYPE) * 8 - 1); \
+ DEST_TYPE mask = ((DEST_TYPE) 1 << width) - 1; \
+ return (SRC0 & ~(mask << offset)) | ((SRC1 & mask) << offset)
+
+uint32_t
+__hsail_bitinsert_u32 (uint32_t src0, uint32_t src1, uint32_t src2,
+ uint32_t src3)
+{
+ BITINSERT (uint32_t, src0, src1, src2, src3);
+}
+
+int64_t
+__hsail_bitinsert_u64 (uint64_t src0, uint64_t src1, uint32_t src2,
+ uint32_t src3)
+{
+ BITINSERT (uint64_t, src0, src1, src2, src3);
+}
+
+#define BITMASK(DEST_TYPE, SRC0, SRC1) \
+ uint32_t offset = SRC0 & (sizeof (DEST_TYPE) * 8 - 1); \
+ uint32_t width = SRC1 & (sizeof (DEST_TYPE) * 8 - 1); \
+ DEST_TYPE mask = ((DEST_TYPE) 1 << width) - 1; \
+ return mask << offset
+
+uint32_t
+__hsail_bitmask_u32 (uint32_t src0, uint32_t src1)
+{
+ BITMASK (uint32_t, src0, src1);
+}
+
+uint64_t
+__hsail_bitmask_u64 (uint32_t src0, uint32_t src1)
+{
+ BITMASK (uint64_t, src0, src1);
+}
+
+/* The dummy, but readable version from
+ http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious
+ This (also) often maps to a single instruction in DSPs. */
+
+#define BITREV(DEST_TYPE, SRC) \
+ DEST_TYPE v = SRC; \
+ DEST_TYPE r = v; \
+ int s = sizeof (SRC) * CHAR_BIT - 1; \
+ \
+ for (v >>= 1; v; v >>= 1) \
+ { \
+ r <<= 1; \
+ r |= v & 1; \
+ s--; \
+ } \
+ return r << s
+
+uint32_t
+__hsail_bitrev_u32 (uint32_t src0)
+{
+ BITREV (uint32_t, src0);
+}
+
+uint64_t
+__hsail_bitrev_u64 (uint64_t src0)
+{
+ BITREV (uint64_t, src0);
+}
+
+uint32_t
+__hsail_bitselect_u32 (uint32_t src0, uint32_t src1, uint32_t src2)
+{
+ return (src1 & src0) | (src2 & ~src0);
+}
+
+uint64_t
+__hsail_bitselect_u64 (uint64_t src0, uint64_t src1, uint64_t src2)
+{
+ return (src1 & src0) | (src2 & ~src0);
+}
+
+/* Due to the defined behavior with 0, we cannot use the gcc builtin
+ __builtin_clz* () directly. __builtin_ffs () has defined behavior, but
+ returns 0 while HSAIL requires to return -1. */
+
+uint32_t
+__hsail_firstbit_u32 (uint32_t src0)
+{
+ if (src0 == 0)
+ return -1;
+ return __builtin_clz (src0);
+}
+
+uint32_t
+__hsail_firstbit_s32 (int32_t src0)
+{
+ uint32_t converted = src0 >= 0 ? src0 : ~src0;
+ return __hsail_firstbit_u32 (converted);
+}
+
+uint32_t
+__hsail_firstbit_u64 (uint64_t src0)
+{
+ if (src0 == 0)
+ return -1;
+ return __builtin_clzl (src0);
+}
+
+uint32_t
+__hsail_firstbit_s64 (int64_t src0)
+{
+ uint64_t converted = src0 >= 0 ? src0 : ~src0;
+ return __hsail_firstbit_u64 (converted);
+}
+
+uint32_t
+__hsail_lastbit_u32 (uint32_t src0)
+{
+ if (src0 == 0)
+ return -1;
+ return __builtin_ctz (src0);
+}
+
+uint32_t
+__hsail_lastbit_u64 (uint64_t src0)
+{
+ if (src0 == 0)
+ return -1;
+ return __builtin_ctzl (src0);
+}
diff --git a/libhsail-rt/rt/fbarrier.c b/libhsail-rt/rt/fbarrier.c
new file mode 100644
index 00000000000..608cec9227c
--- /dev/null
+++ b/libhsail-rt/rt/fbarrier.c
@@ -0,0 +1,87 @@
+/* fbarrier.c -- HSAIL fbarrier built-ins.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <signal.h>
+
+#include "workitems.h"
+#include "phsa-rt.h"
+
+#ifdef HAVE_FIBERS
+#include "fibers.h"
+
+typedef fiber_barrier_t fbarrier;
+
+void
+__hsail_initfbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ fbar->threshold = 0;
+ fbar->reached = 0;
+ fbar->waiting_count = 0;
+}
+
+void
+__hsail_releasefbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ fbar->threshold = 0;
+ fbar->reached = 0;
+ fbar->waiting_count = 0;
+}
+
+void
+__hsail_joinfbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ ++fbar->threshold;
+}
+
+void
+__hsail_leavefbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ --fbar->threshold;
+}
+
+void
+__hsail_waitfbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ fiber_barrier_reach (fbar);
+}
+
+void
+__hsail_arrivefbar (uint32_t addr, PHSAWorkItem *wi)
+{
+ fbarrier *fbar = (fbarrier *) (wi->wg->group_base_ptr + addr);
+ ++fbar->reached;
+ if (fbar->reached == fbar->threshold)
+ fbar->reached = 0;
+}
+
+#endif
+
diff --git a/libhsail-rt/rt/fibers.c b/libhsail-rt/rt/fibers.c
new file mode 100644
index 00000000000..a3056a92cde
--- /dev/null
+++ b/libhsail-rt/rt/fibers.c
@@ -0,0 +1,220 @@
+/* fibers.c -- extremely simple lightweight thread (fiber) implementation
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "target-config.h"
+
+#include "fibers.h"
+
+void
+phsa_fatal_error (int code);
+
+ucontext_t main_context;
+
+/* The last fiber in the linked list. */
+static fiber_t *tail_fiber = NULL;
+/* The first fiber in the linked list. */
+static fiber_t *head_fiber = NULL;
+/* The fiber currently being executed. */
+static fiber_t *current_fiber = NULL;
+
+/* Makecontext accepts only integer arguments. We need to split the
+ pointer argument in case pointer does not fit into int. This helper
+ function can be used to restore the pointer from the arguments. */
+
+void *
+fiber_int_args_to_ptr (int arg0, int arg1)
+{
+ void *ptr = NULL;
+#if SIZEOF_VOIDP == 8 && SIZEOF_INT == 4
+ ptr = (void*)(((uint64_t) arg0 & (uint64_t) 0xFFFFFFFF)
+ | ((uint64_t) arg1 << 32));
+#elif SIZEOF_VOIDP == 4 && SIZEOF_INT == 4
+ ptr = (void*)arg0;
+#else
+# error Unsupported pointer/int size.
+#endif
+ return ptr;
+}
+
+void
+fiber_init (fiber_t *fiber, fiber_function_t start_function, void *arg,
+ size_t stack_size, size_t stack_align)
+{
+ int arg0, arg1;
+ if (getcontext (&fiber->context) != 0)
+ phsa_fatal_error (3);
+ if (posix_memalign (&fiber->context.uc_stack.ss_sp, stack_align, stack_size)
+ != 0)
+ phsa_fatal_error (4);
+ fiber->context.uc_stack.ss_size = stack_size;
+ fiber->context.uc_link = &main_context;
+
+ /* makecontext () accepts only integer arguments. Split the
+ pointer argument to two args in the case pointer does not fit
+ into one int. */
+#if SIZEOF_VOIDP == 8 && SIZEOF_INT == 4
+ arg0 = (int32_t) 0xFFFFFFFF & (uint64_t)arg;
+ arg1 = (int32_t) 0xFFFFFFFF & ((uint64_t)arg >> 32);
+#elif SIZEOF_VOIDP == 4 && SIZEOF_INT == 4
+ arg0 = (int)arg;
+ arg1 = 0;
+#else
+# error Unsupported pointer/int size.
+#endif
+
+ makecontext (&fiber->context, (void*)start_function, 2, arg0, arg1);
+
+ fiber->status = FIBER_STATUS_READY;
+ fiber->next = NULL;
+ fiber->prev = NULL;
+
+ /* Create a linked list of the created fibers. Append the new one at
+ the end. */
+ if (tail_fiber == NULL)
+ tail_fiber = fiber;
+ else
+ {
+ tail_fiber->next = fiber;
+ fiber->prev = tail_fiber;
+ tail_fiber = fiber;
+ }
+
+ if (head_fiber == NULL)
+ head_fiber = fiber;
+}
+
+void
+fiber_exit ()
+{
+ fiber_status_t old_status = current_fiber->status;
+ current_fiber->status = FIBER_STATUS_EXITED;
+ if (old_status == FIBER_STATUS_JOINED)
+ /* In case this thread has been joined, return back to the joiner. */
+ swapcontext (&current_fiber->context, &main_context);
+ else
+ /* In case the thread exited while being yielded from another thread,
+ switch back to another fiber. */
+ fiber_yield ();
+}
+
+void
+fiber_join (fiber_t *fiber)
+{
+ fiber_t *next_ready_fiber = NULL;
+ current_fiber = fiber;
+ if (fiber->status != FIBER_STATUS_EXITED)
+ {
+ fiber->status = FIBER_STATUS_JOINED;
+ while (fiber->status != FIBER_STATUS_EXITED)
+ swapcontext (&main_context, &fiber->context);
+ }
+
+ /* Remove the successfully joined fiber from the linked list so we won't
+ access it later (the fiber itself might be freed after the join). */
+ if (fiber->prev != NULL)
+ fiber->prev->next = fiber->next;
+
+ if (fiber->next != NULL)
+ fiber->next->prev = fiber->prev;
+
+ if (head_fiber == fiber)
+ head_fiber = fiber->next;
+
+ if (tail_fiber == fiber)
+ tail_fiber = fiber->prev;
+
+ free (fiber->context.uc_stack.ss_sp);
+}
+
+void
+fiber_yield ()
+{
+ fiber_t *next_ready_fiber = current_fiber;
+
+ if (current_fiber == head_fiber
+ && current_fiber == tail_fiber)
+ {
+ /* If the last fiber exits independently, there is no
+ fiber to switch to. Switch to the main context in that
+ case. */
+ if (current_fiber->status == FIBER_STATUS_EXITED)
+ swapcontext (&current_fiber->context, &main_context);
+ }
+
+ do {
+ next_ready_fiber = next_ready_fiber->next != NULL
+ ? next_ready_fiber->next : head_fiber;
+ } while (next_ready_fiber != current_fiber
+ && next_ready_fiber->status == FIBER_STATUS_EXITED);
+
+ fiber_t *old_current_fiber = current_fiber;
+ current_fiber = next_ready_fiber;
+ swapcontext (&old_current_fiber->context, &next_ready_fiber->context);
+}
+
+size_t
+fiber_barrier_reach (fiber_barrier_t *barrier)
+{
+ /* Yield once to ensure that there are no fibers waiting for
+ a previous triggering of the barrier in the waiting_count
+ loop. This should release them before we update the reached
+ counter again. */
+ fiber_yield ();
+
+ barrier->reached++;
+ ++barrier->waiting_count;
+ while (barrier->reached < barrier->threshold)
+ fiber_yield ();
+ --barrier->waiting_count;
+
+ /* Wait until all the fibers have reached this point. */
+ while (barrier->waiting_count > 0)
+ fiber_yield ();
+
+ /* Now all fibers have been released from the barrier waiting
+ loop. We can now safely reset the reach count for new triggering. */
+ if (barrier->reached > 0)
+ {
+ barrier->reached = 0;
+ return 0;
+ }
+ return 1;
+}
+
+void
+fiber_barrier_init (fiber_barrier_t *barrier, size_t threshold)
+{
+ barrier->threshold = threshold;
+ barrier->waiting_count = 0;
+ barrier->reached = 0;
+}
diff --git a/libhsail-rt/rt/fp16.c b/libhsail-rt/rt/fp16.c
new file mode 100644
index 00000000000..01cb1e05103
--- /dev/null
+++ b/libhsail-rt/rt/fp16.c
@@ -0,0 +1,135 @@
+/* Half-float conversion routines. Code mostly borrowed from the ARM's
+ builtin function.
+
+ Copyright (C) 2008-2015 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+static inline unsigned short
+__gnu_f2h_internal (unsigned int a, int ieee)
+{
+ unsigned short sign = (a >> 16) & 0x8000;
+ int aexp = (a >> 23) & 0xff;
+ unsigned int mantissa = a & 0x007fffff;
+ unsigned int mask;
+ unsigned int increment;
+
+ if (aexp == 0xff)
+ {
+ if (!ieee)
+ return sign;
+ if (mantissa == 0)
+ return sign | 0x7c00; /* Infinity. */
+ /* Remaining cases are NaNs. Convert SNaN to QNaN. */
+ return sign | 0x7e00 | (mantissa >> 13);
+ }
+
+ if (aexp == 0 && mantissa == 0)
+ return sign;
+
+ aexp -= 127;
+
+ /* Decimal point between bits 22 and 23. */
+ mantissa |= 0x00800000;
+ if (aexp < -14)
+ {
+ mask = 0x00ffffff;
+ if (aexp >= -25)
+ mask >>= 25 + aexp;
+ }
+ else
+ mask = 0x00001fff;
+
+ /* Round. */
+ if (mantissa & mask)
+ {
+ increment = (mask + 1) >> 1;
+ if ((mantissa & mask) == increment)
+ increment = mantissa & (increment << 1);
+ mantissa += increment;
+ if (mantissa >= 0x01000000)
+ {
+ mantissa >>= 1;
+ aexp++;
+ }
+ }
+
+ if (ieee)
+ {
+ if (aexp > 15)
+ return sign | 0x7c00;
+ }
+ else
+ {
+ if (aexp > 16)
+ return sign | 0x7fff;
+ }
+
+ if (aexp < -24)
+ return sign;
+
+ if (aexp < -14)
+ {
+ mantissa >>= -14 - aexp;
+ aexp = -14;
+ }
+
+ /* We leave the leading 1 in the mantissa, and subtract one
+ from the exponent bias to compensate. */
+ return sign | (((aexp + 14) << 10) + (mantissa >> 13));
+}
+
+static unsigned int
+__gnu_h2f_internal (unsigned short a, int ieee)
+{
+ unsigned int sign = (unsigned int) (a & 0x8000) << 16;
+ int aexp = (a >> 10) & 0x1f;
+ unsigned int mantissa = a & 0x3ff;
+
+ if (aexp == 0x1f && ieee)
+ return sign | 0x7f800000 | (mantissa << 13);
+
+ if (aexp == 0)
+ {
+ int shift;
+
+ if (mantissa == 0)
+ return sign;
+
+ shift = __builtin_clz (mantissa) - 21;
+ mantissa <<= shift;
+ aexp = -shift;
+ }
+
+ return sign | (((aexp + 0x70) << 23) + (mantissa << 13));
+}
+
+unsigned short
+__hsail_f32_to_f16 (unsigned int a)
+{
+ return __gnu_f2h_internal (a, 1);
+}
+
+unsigned int
+__hsail_f16_to_f32 (unsigned short a)
+{
+ return __gnu_h2f_internal (a, 1);
+}
diff --git a/libhsail-rt/rt/misc.c b/libhsail-rt/rt/misc.c
new file mode 100644
index 00000000000..547734e3e61
--- /dev/null
+++ b/libhsail-rt/rt/misc.c
@@ -0,0 +1,89 @@
+/* misc.c -- Builtins for HSAIL misc instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdint.h>
+#include <time.h>
+
+#include "workitems.h"
+
+/* Return the monotonic clock as nanoseconds. */
+
+uint64_t
+__hsail_clock ()
+{
+ struct timespec t;
+ clock_gettime (CLOCK_MONOTONIC, &t);
+ return (uint64_t) t.tv_sec * 1000000000 + (uint64_t) t.tv_nsec;
+}
+
+uint32_t
+__hsail_cuid (PHSAWorkItem *wi)
+{
+ /* All WIs are executed with a single compute unit (core/thread)
+ for now. */
+ return 0;
+}
+
+uint32_t
+__hsail_maxcuid (PHSAWorkItem *wi)
+{
+ /* All WIs are executed with a single compute unit (core/thread)
+ for now. */
+ return 0;
+}
+
+void
+__hsail_debugtrap (uint32_t src, PHSAWorkItem *wi)
+{
+ /* Could we produce a SIGTRAP signal here to drop to gdb
+ console, or similar? In any case, the execution of the
+ kernel should halt.
+ */
+ return;
+}
+
+uint32_t
+__hsail_groupbaseptr (PHSAWorkItem *wi)
+{
+ return (uint32_t) (uint64_t) (wi->wg->group_base_ptr
+ - wi->launch_data->group_segment_start_addr);
+}
+
+uint64_t
+__hsail_kernargbaseptr_u64 (PHSAWorkItem *wi)
+{
+ /* For now assume only a single kernarg allocation at a time.
+ Proper kernarg memory management to do. */
+ return (uint64_t) wi->launch_data->kernarg_addr;
+}
+
+uint32_t
+__hsail_kernargbaseptr_u32 (PHSAWorkItem *wi)
+{
+ /* For now assume only a single kernarg allocation at a time.
+ Proper kernarg memory management to do. */
+ return 0;
+}
diff --git a/libhsail-rt/rt/multimedia.c b/libhsail-rt/rt/multimedia.c
new file mode 100644
index 00000000000..31125ed2933
--- /dev/null
+++ b/libhsail-rt/rt/multimedia.c
@@ -0,0 +1,135 @@
+/* multimedia.c -- Builtins for HSAIL multimedia instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <math.h>
+#include <stdint.h>
+
+uint32_t
+__hsail_bitalign (uint64_t lower, uint64_t upper, uint32_t shift_amount)
+{
+ shift_amount = shift_amount & 31;
+ uint64_t packed_value = (upper << 32) | lower;
+ return (packed_value >> shift_amount) & 0xFFFFFFFF;
+}
+
+uint32_t
+__hsail_bytealign (uint64_t lower, uint64_t upper, uint32_t shift_amount)
+{
+ shift_amount = (shift_amount & 3) * 8;
+ uint64_t packed_value = (upper << 32) | lower;
+ return (packed_value >> shift_amount) & 0xFFFFFFFF;
+}
+
+uint32_t
+__hsail_lerp (uint32_t a, uint32_t b, uint32_t c)
+{
+ uint32_t e3
+ = (((((a >> 24) & 0xff) + ((b >> 24) & 0xff) + ((c >> 24) & 0x1)) / 2)
+ & 0xff)
+ << 24;
+ uint32_t e2
+ = (((((a >> 16) & 0xff) + ((b >> 16) & 0xff) + ((c >> 16) & 0x1)) / 2)
+ & 0xff)
+ << 16;
+ uint32_t e1
+ = (((((a >> 8) & 0xff) + ((b >> 8) & 0xff) + ((c >> 8) & 0x1)) / 2) & 0xff)
+ << 8;
+ uint32_t e0 = (((a & 0xff) + (b & 0xff) + (c & 0x1)) / 2) & 0xff;
+
+ return e3 | e2 | e1 | e0;
+}
+
+static uint8_t
+cvt_neari_sat_u8_f32 (float a)
+{
+ if (isinf (a))
+ {
+ if (signbit (a)) return 0;
+ else return 255;
+ }
+ else if (isnan (a)) return 0;
+ else if (a < 0.0)
+ return 0;
+ else if (a > 255.0)
+ return 255;
+ else
+ return (uint8_t) a;
+}
+
+uint32_t
+__hsail_packcvt (float a, float b, float c, float d)
+{
+ return (uint32_t) cvt_neari_sat_u8_f32 (a)
+ | (uint32_t) cvt_neari_sat_u8_f32 (b) << 8
+ | (uint32_t) cvt_neari_sat_u8_f32 (c) << 16
+ | (uint32_t) cvt_neari_sat_u8_f32 (d) << 24;
+}
+
+float
+__hsail_unpackcvt (uint32_t val, uint32_t index)
+{
+ return (float) ((val >> (index * 8)) & 0xff);
+}
+
+static uint32_t
+abs_diff (uint32_t a, uint32_t b)
+{
+ if (a < b)
+ return b - a;
+ else
+ return a - b;
+}
+
+uint32_t
+__hsail_sad_u8x4 (uint32_t a, uint32_t b, uint32_t add)
+{
+ return abs_diff ((a >> 24) & 0xff, (b >> 24) & 0xff)
+ + abs_diff ((a >> 16) & 0xff, (b >> 16) & 0xff)
+ + abs_diff ((a >> 8) & 0xff, (b >> 8) & 0xff)
+ + abs_diff ((a >> 0) & 0xff, (b >> 0) & 0xff) + add;
+}
+
+uint32_t
+__hsail_sad_u16x2 (uint32_t a, uint32_t b, uint32_t add)
+{
+ return abs_diff ((a >> 16) & 0xffff, (b >> 16) & 0xffff)
+ + abs_diff ((a >> 0) & 0xffff, (b >> 0) & 0xffff) + add;
+}
+
+uint32_t
+__hsail_sad_u32 (uint32_t a, uint32_t b, uint32_t add)
+{
+ return abs_diff (a, b) + add;
+}
+
+uint32_t
+__hsail_sadhi_u16x2_u8x4 (uint32_t a, uint32_t b, uint32_t add)
+{
+ return (abs_diff ((a >> 24) & 0xff, (b >> 24) & 0xff) << 16)
+ + (abs_diff ((a >> 16) & 0xff, (b >> 16) & 0xff) << 16)
+ + (abs_diff ((a >> 8) & 0xff, (b >> 8) & 0xff) << 16)
+ + (abs_diff ((a >> 0) & 0xff, (b >> 0) & 0xff) << 16) + add;
+}
diff --git a/libhsail-rt/rt/queue.c b/libhsail-rt/rt/queue.c
new file mode 100644
index 00000000000..3d7ff7616b3
--- /dev/null
+++ b/libhsail-rt/rt/queue.c
@@ -0,0 +1,71 @@
+/* queue.c -- Builtins for HSAIL queue related instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "phsa-queue-interface.h"
+
+uint64_t
+__hsail_ldqueuereadindex (uint64_t queue_addr)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ return queue->read_index;
+}
+
+uint64_t
+__hsail_ldqueuewriteindex (uint64_t queue_addr)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ return queue->write_index;
+}
+
+uint64_t
+__hsail_addqueuewriteindex (uint64_t queue_addr, uint64_t value)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ return __sync_fetch_and_add (&queue->write_index, value);
+}
+
+uint64_t
+__hsail_casqueuewriteindex (uint64_t queue_addr, uint64_t cmp_value,
+ uint64_t new_value)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ return __sync_val_compare_and_swap (&queue->write_index, cmp_value,
+ new_value);
+}
+
+void
+__hsail_stqueuereadindex (uint64_t queue_addr, uint64_t value)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ queue->read_index = value;
+}
+
+void
+__hsail_stqueuewriteindex (uint64_t queue_addr, uint64_t value)
+{
+ phsa_queue_t *queue = (phsa_queue_t *) queue_addr;
+ queue->write_index = value;
+}
diff --git a/libhsail-rt/rt/sat_arithmetic.c b/libhsail-rt/rt/sat_arithmetic.c
new file mode 100644
index 00000000000..3e4024591ae
--- /dev/null
+++ b/libhsail-rt/rt/sat_arithmetic.c
@@ -0,0 +1,299 @@
+/* sat_arithmetic.c -- Builtins for HSAIL saturating arithmetic instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdint.h>
+
+uint8_t
+__hsail_sat_add_u8 (uint8_t a, uint8_t b)
+{
+ uint16_t c = (uint16_t) a + (uint16_t) b;
+ if (c > UINT8_MAX)
+ return UINT8_MAX;
+ else
+ return c;
+}
+
+uint16_t
+__hsail_sat_add_u16 (uint16_t a, uint16_t b)
+{
+ uint32_t c = (uint32_t) a + (uint32_t) b;
+ if (c > UINT16_MAX)
+ return UINT16_MAX;
+ else
+ return c;
+}
+
+uint32_t
+__hsail_sat_add_u32 (uint32_t a, uint32_t b)
+{
+ uint64_t c = (uint64_t) a + (uint64_t) b;
+ if (c > UINT32_MAX)
+ return UINT32_MAX;
+ else
+ return c;
+}
+
+uint64_t
+__hsail_sat_add_u64 (uint64_t a, uint64_t b)
+{
+ __uint128_t c = (__uint128_t) a + (__uint128_t) b;
+ if (c > UINT64_MAX)
+ return UINT64_MAX;
+ else
+ return c;
+}
+
+int8_t
+__hsail_sat_add_s8 (int8_t a, int8_t b)
+{
+ int16_t c = (int16_t) a + (int16_t) b;
+ if (c > INT8_MAX)
+ return INT8_MAX;
+ else if (c < INT8_MIN)
+ return INT8_MIN;
+ else
+ return c;
+}
+
+int16_t
+__hsail_sat_add_s16 (int16_t a, int16_t b)
+{
+ int32_t c = (int32_t) a + (int32_t) b;
+ if (c > INT16_MAX)
+ return INT16_MAX;
+ else if (c < INT16_MIN)
+ return INT16_MIN;
+ else
+ return c;
+}
+
+int32_t
+__hsail_sat_add_s32 (int32_t a, int32_t b)
+{
+ int64_t c = (int64_t) a + (int64_t) b;
+ if (c > INT32_MAX)
+ return INT32_MAX;
+ else if (c < INT32_MIN)
+ return INT32_MIN;
+ else
+ return c;
+}
+
+int64_t
+__hsail_sat_add_s64 (int64_t a, int64_t b)
+{
+ __int128_t c = (__int128_t) a + (__int128_t) b;
+ if (c > INT64_MAX)
+ return INT64_MAX;
+ else if (c < INT64_MIN)
+ return INT64_MIN;
+ else
+ return c;
+}
+
+uint8_t
+__hsail_sat_sub_u8 (uint8_t a, uint8_t b)
+{
+ int16_t c = (uint16_t) a - (uint16_t) b;
+ if (c < 0)
+ return 0;
+ else if (c > UINT8_MAX)
+ return UINT8_MAX;
+ else
+ return c;
+}
+
+uint16_t
+__hsail_sat_sub_u16 (uint16_t a, uint16_t b)
+{
+ int32_t c = (uint32_t) a - (uint32_t) b;
+ if (c < 0)
+ return 0;
+ else if (c > UINT16_MAX)
+ return UINT16_MAX;
+ else
+ return c;
+}
+
+uint32_t
+__hsail_sat_sub_u32 (uint32_t a, uint32_t b)
+{
+ int64_t c = (uint64_t) a - (uint64_t) b;
+ if (c < 0)
+ return 0;
+ else if (c > UINT32_MAX)
+ return UINT32_MAX;
+ else
+ return c;
+}
+
+uint64_t
+__hsail_sat_sub_u64 (uint64_t a, uint64_t b)
+{
+ __int128_t c = (__uint128_t) a - (__uint128_t) b;
+ if (c < 0)
+ return 0;
+ else if (c > UINT64_MAX)
+ return UINT64_MAX;
+ else
+ return c;
+}
+
+int8_t
+__hsail_sat_sub_s8 (int8_t a, int8_t b)
+{
+ int16_t c = (int16_t) a - (int16_t) b;
+ if (c > INT8_MAX)
+ return INT8_MAX;
+ else if (c < INT8_MIN)
+ return INT8_MIN;
+ else
+ return c;
+}
+
+int16_t
+__hsail_sat_sub_s16 (int16_t a, int16_t b)
+{
+ int32_t c = (int32_t) a - (int32_t) b;
+ if (c > INT16_MAX)
+ return INT16_MAX;
+ else if (c < INT16_MIN)
+ return INT16_MIN;
+ else
+ return c;
+}
+
+int32_t
+__hsail_sat_sub_s32 (int32_t a, int32_t b)
+{
+ int64_t c = (int64_t) a - (int64_t) b;
+ if (c > INT32_MAX)
+ return INT32_MAX;
+ else if (c < INT32_MIN)
+ return INT32_MIN;
+ else
+ return c;
+}
+
+int64_t
+__hsail_sat_sub_s64 (int64_t a, int64_t b)
+{
+ __int128_t c = (__int128_t) a - (__int128_t) b;
+ if (c > INT64_MAX)
+ return INT64_MAX;
+ else if (c < INT64_MIN)
+ return INT64_MIN;
+ else
+ return c;
+}
+
+uint8_t
+__hsail_sat_mul_u8 (uint8_t a, uint8_t b)
+{
+ uint16_t c = (uint16_t) a * (uint16_t) b;
+ if (c > UINT8_MAX)
+ return UINT8_MAX;
+ else
+ return c;
+}
+
+uint16_t
+__hsail_sat_mul_u16 (uint16_t a, uint16_t b)
+{
+ uint32_t c = (uint32_t) a * (uint32_t) b;
+ if (c > UINT16_MAX)
+ return UINT16_MAX;
+ else
+ return c;
+}
+
+uint32_t
+__hsail_sat_mul_u32 (uint32_t a, uint32_t b)
+{
+ uint64_t c = (uint64_t) a * (uint64_t) b;
+ if (c > UINT32_MAX)
+ return UINT32_MAX;
+ else
+ return c;
+}
+
+uint64_t
+__hsail_sat_mul_u64 (uint64_t a, uint64_t b)
+{
+ __uint128_t c = (__uint128_t) a * (__uint128_t) b;
+ if (c > UINT64_MAX)
+ return UINT64_MAX;
+ else
+ return c;
+}
+
+int8_t
+__hsail_sat_mul_s8 (int8_t a, int8_t b)
+{
+ int16_t c = (int16_t) a * (int16_t) b;
+ if (c > INT8_MAX)
+ return INT8_MAX;
+ else if (c < INT8_MIN)
+ return INT8_MIN;
+ else
+ return c;
+}
+
+int16_t
+__hsail_sat_mul_s16 (int16_t a, int16_t b)
+{
+ int32_t c = (int32_t) a * (int32_t) b;
+ if (c > INT16_MAX)
+ return INT16_MAX;
+ else if (c < INT16_MIN)
+ return INT16_MIN;
+ else
+ return c;
+}
+
+int32_t
+__hsail_sat_mul_s32 (int32_t a, int32_t b)
+{
+ int64_t c = (int64_t) a * (int64_t) b;
+ if (c > INT32_MAX)
+ return INT32_MAX;
+ else if (c < INT32_MIN)
+ return INT32_MIN;
+ else
+ return c;
+}
+
+int64_t
+__hsail_sat_mul_s64 (int64_t a, int64_t b)
+{
+ __int128_t c = (__int128_t) a * (__int128_t) b;
+ if (c > INT64_MAX)
+ return INT64_MAX;
+ else if (c < INT64_MIN)
+ return INT64_MIN;
+ else
+ return c;
+}
diff --git a/libhsail-rt/rt/segment.c b/libhsail-rt/rt/segment.c
new file mode 100644
index 00000000000..a1d2c843b81
--- /dev/null
+++ b/libhsail-rt/rt/segment.c
@@ -0,0 +1,57 @@
+/* segment.c -- Builtins for HSAIL segment related instructions.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "workitems.h"
+
+uint32_t
+__hsail_segmentp_private (uint64_t flat_addr, PHSAWorkItem *wi)
+{
+ if (flat_addr == 0)
+ return 1;
+ else
+ return (void *) flat_addr >= wi->wg->private_base_ptr
+ && (void *) flat_addr
+ < wi->wg->private_base_ptr + wi->wg->private_segment_total_size;
+}
+
+uint32_t
+__hsail_segmentp_group (uint64_t flat_addr, PHSAWorkItem *wi)
+{
+ if (flat_addr == 0)
+ return 1;
+ else
+ return (void *) flat_addr >= wi->wg->group_base_ptr
+ && (void *) flat_addr < wi->wg->group_base_ptr
+ + wi->launch_data->dp->group_segment_size;
+}
+
+uint32_t
+__hsail_segmentp_global (uint64_t flat_addr, PHSAWorkItem *wi)
+{
+ return (flat_addr == 0
+ || (!__hsail_segmentp_private (flat_addr, wi)
+ && !__hsail_segmentp_group (flat_addr, wi)));
+}
diff --git a/libhsail-rt/rt/workitems.c b/libhsail-rt/rt/workitems.c
new file mode 100644
index 00000000000..80bcaddd007
--- /dev/null
+++ b/libhsail-rt/rt/workitems.c
@@ -0,0 +1,952 @@
+/* workitems.c -- The main runtime entry that performs work-item execution in
+ various ways and the builtin functions closely related to the
+ implementation.
+
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/* The fiber based multiple work-item work-group execution uses ucontext
+ based user mode threading. However, if gccbrig is able to optimize the
+ kernel to a much faster work-group function that implements the multiple
+ WI execution using loops instead of fibers requiring slow context switches,
+ the fiber-based implementation won't be called.
+ */
+
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+
+#include "workitems.h"
+#include "phsa-rt.h"
+
+#ifdef HAVE_FIBERS
+#include "fibers.h"
+#endif
+
+#ifdef BENCHMARK_PHSA_RT
+#include <stdio.h>
+#include <time.h>
+
+static uint64_t wi_count = 0;
+static uint64_t wis_skipped = 0;
+static uint64_t wi_total = 0;
+static clock_t start_time;
+
+#endif
+
+#ifdef DEBUG_PHSA_RT
+#include <stdio.h>
+#endif
+
+#define PRIVATE_SEGMENT_ALIGN 256
+#define FIBER_STACK_SIZE (64*1024)
+#define GROUP_SEGMENT_ALIGN 256
+
+/* HSA requires WGs to be executed in flat work-group id order. Enabling
+ the following macro can reveal test cases that rely on the ordering,
+ but is not useful for much else. */
+
+uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
+
+uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
+
+uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
+
+uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
+
+uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
+
+void
+phsa_fatal_error (int code)
+{
+ exit (code);
+}
+
+#ifdef HAVE_FIBERS
+/* ucontext-based work-item thread implementation. Runs all work-items in
+ separate fibers. */
+
+static void
+phsa_work_item_thread (int arg0, int arg1)
+{
+ void *arg = fiber_int_args_to_ptr (arg0, arg1);
+
+ PHSAWorkItem *wi = (PHSAWorkItem *) arg;
+ volatile PHSAWorkGroup *wg = wi->wg;
+ PHSAKernelLaunchData *l_data = wi->launch_data;
+
+ do
+ {
+ int retcode
+ = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
+
+ /* At this point the threads can assume that either more_wgs is 0 or
+ the current_work_group_* is set to point to the WG executed next. */
+ if (!wi->wg->more_wgs)
+ break;
+#ifdef DEBUG_PHSA_RT
+ printf (
+ "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
+ wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
+ l_data->wg_max_y, l_data->wg_max_z);
+#endif
+
+ if (wi->x < __hsail_currentworkgroupsize (0, wi)
+ && wi->y < __hsail_currentworkgroupsize (1, wi)
+ && wi->z < __hsail_currentworkgroupsize (2, wi))
+ {
+ l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
+ wg->private_base_ptr);
+#ifdef DEBUG_PHSA_RT
+ printf ("done.\n");
+#endif
+#ifdef BENCHMARK_PHSA_RT
+ wi_count++;
+#endif
+ }
+ else
+ {
+#ifdef DEBUG_PHSA_RT
+ printf ("skipped (partial WG).\n");
+#endif
+#ifdef BENCHMARK_PHSA_RT
+ wis_skipped++;
+#endif
+ }
+
+ retcode
+ = fiber_barrier_reach ((fiber_barrier_t *)
+ l_data->wg_completion_barrier);
+
+ /* The first thread updates the WG to execute next etc. */
+
+ if (retcode == 0)
+ {
+#ifdef EXECUTE_WGS_BACKWARDS
+ if (wg->x == l_data->wg_min_x)
+ {
+ wg->x = l_data->wg_max_x - 1;
+ if (wg->y == l_data->wg_min_y)
+ {
+ wg->y = l_data->wg_max_y - 1;
+ if (wg->z == l_data->wg_min_z)
+ wg->more_wgs = 0;
+ else
+ wg->z--;
+ }
+ else
+ wg->y--;
+ }
+ else
+ wg->x--;
+#else
+ if (wg->x + 1 >= l_data->wg_max_x)
+ {
+ wg->x = l_data->wg_min_x;
+ if (wg->y + 1 >= l_data->wg_max_y)
+ {
+ wg->y = l_data->wg_min_y;
+ if (wg->z + 1 >= l_data->wg_max_z)
+ wg->more_wgs = 0;
+ else
+ wg->z++;
+ }
+ else
+ wg->y++;
+ }
+ else
+ wg->x++;
+#endif
+
+ /* Reinitialize the work-group barrier according to the new WG's
+ size, which might not be the same as the previous ones, due
+ to "partial WGs". */
+ size_t wg_size = __hsail_currentworkgroupsize (0, wi)
+ * __hsail_currentworkgroupsize (1, wi)
+ * __hsail_currentworkgroupsize (2, wi);
+
+#ifdef DEBUG_PHSA_RT
+ printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
+#endif
+ fiber_barrier_init ((fiber_barrier_t *)
+ wi->launch_data->wg_sync_barrier,
+ wg_size);
+
+#ifdef BENCHMARK_PHSA_RT
+ if (wi_count % 1000 == 0)
+ {
+ clock_t spent_time = clock () - start_time;
+ double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
+ double wis_per_sec = wi_count / spent_time_sec;
+ uint64_t eta_sec
+ = (wi_total - wi_count - wis_skipped) / wis_per_sec;
+
+ printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
+ "%lu s)\n",
+ wi_count, wis_skipped, (uint64_t) spent_time_sec,
+ (uint64_t) wis_per_sec, (uint64_t) eta_sec);
+ }
+#endif
+ }
+ }
+ while (1);
+
+ fiber_exit ();
+}
+#endif
+
+#define MIN(a, b) ((a < b) ? a : b)
+#define MAX(a, b) ((a > b) ? a : b)
+
+#ifdef HAVE_FIBERS
+/* Spawns a given number of work-items to execute a set of work-groups,
+ blocks until their completion. */
+
+static void
+phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
+ size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
+{
+ PHSAWorkItem *wi_threads = NULL;
+ PHSAWorkGroup wg;
+ size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
+ fiber_barrier_t wg_start_barrier;
+ fiber_barrier_t wg_completion_barrier;
+ fiber_barrier_t wg_sync_barrier;
+
+ max_x = wg_size_x == 0 ? 1 : wg_size_x;
+ max_y = wg_size_y == 0 ? 1 : wg_size_y;
+ max_z = wg_size_z == 0 ? 1 : wg_size_z;
+
+ size_t wg_size = max_x * max_y * max_z;
+ if (wg_size > PHSA_MAX_WG_SIZE)
+ phsa_fatal_error (2);
+
+ wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
+ if (wg.private_segment_total_size > 0
+ && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
+ wg.private_segment_total_size)
+ != 0)
+ phsa_fatal_error (3);
+
+ wg.alloca_stack_p = wg.private_segment_total_size;
+ wg.alloca_frame_p = wg.alloca_stack_p;
+
+#ifdef EXECUTE_WGS_BACKWARDS
+ wg.x = context->wg_max_x - 1;
+ wg.y = context->wg_max_y - 1;
+ wg.z = context->wg_max_z - 1;
+#else
+ wg.x = context->wg_min_x;
+ wg.y = context->wg_min_y;
+ wg.z = context->wg_min_z;
+#endif
+
+ fiber_barrier_init (&wg_sync_barrier, wg_size);
+ fiber_barrier_init (&wg_start_barrier, wg_size);
+ fiber_barrier_init (&wg_completion_barrier, wg_size);
+
+ context->wg_start_barrier = &wg_start_barrier;
+ context->wg_sync_barrier = &wg_sync_barrier;
+ context->wg_completion_barrier = &wg_completion_barrier;
+
+ wg.more_wgs = 1;
+ wg.group_base_ptr = group_base_ptr;
+
+#ifdef BENCHMARK_PHSA_RT
+ wi_count = 0;
+ wis_skipped = 0;
+ start_time = clock ();
+#endif
+ wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
+ for (x = 0; x < max_x; ++x)
+ for (y = 0; y < max_y; ++y)
+ for (z = 0; z < max_z; ++z)
+ {
+ PHSAWorkItem *wi = &wi_threads[flat_wi_id];
+ wi->launch_data = context;
+ wi->wg = &wg;
+ wi->x = x;
+ wi->y = y;
+ wi->z = z;
+
+ /* TODO: set the stack size according to the private
+ segment size. Too big stack consumes huge amount of
+ memory in case of huge number of WIs and a too small stack
+ will fail in mysterious and potentially dangerous ways. */
+
+ fiber_init (&wi->fiber, phsa_work_item_thread, wi,
+ FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
+ ++flat_wi_id;
+ }
+
+ do
+ {
+ --flat_wi_id;
+ fiber_join (&wi_threads[flat_wi_id].fiber);
+ }
+ while (flat_wi_id > 0);
+
+ if (wg.private_segment_total_size > 0)
+ free (wg.private_base_ptr);
+
+ free (wi_threads);
+}
+
+/* Spawn the work-item threads to execute work-groups and let
+ them execute all the WGs, including a potential partial WG. */
+
+static void
+phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr)
+{
+ hsa_kernel_dispatch_packet_t *dp = context->dp;
+ size_t x, y, z;
+
+ /* TO DO: host-side memory management of group and private segment
+ memory. Agents in general are less likely to support efficient dynamic mem
+ allocation. */
+ if (dp->group_segment_size > 0
+ && posix_memalign (&group_base_ptr, PRIVATE_SEGMENT_ALIGN,
+ dp->group_segment_size) != 0)
+ phsa_fatal_error (3);
+
+ context->group_segment_start_addr = (size_t) group_base_ptr;
+
+ /* HSA seems to allow the WG size to be larger than the grid size. We need to
+ saturate the effective WG size to the grid size to prevent the extra WIs
+ from executing. */
+ size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
+ sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
+ sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
+ sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
+ sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
+
+#ifdef BENCHMARK_PHSA_RT
+ wi_total = (uint64_t) dp->grid_size_x
+ * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
+ * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
+#endif
+
+ /* For now execute all work groups in a single coarse thread (does not utilize
+ multicore/multithread). */
+ context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
+
+ int dims = dp->setup & 0x3;
+
+ context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
+ / dp->workgroup_size_x;
+
+ context->wg_max_y
+ = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
+ / dp->workgroup_size_y;
+
+ context->wg_max_z
+ = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
+ / dp->workgroup_size_z;
+
+#ifdef DEBUG_PHSA_RT
+ printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
+ "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
+ context->wg_min_x, context->wg_min_y, context->wg_min_z,
+ context->wg_max_x, context->wg_max_y, context->wg_max_z,
+ sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
+ dp->grid_size_y, dp->grid_size_z);
+#endif
+
+ phsa_execute_wi_gang (context, group_base_ptr, sat_wg_size_x, sat_wg_size_y,
+ sat_wg_size_z);
+
+ if (dp->group_segment_size > 0)
+ free (group_base_ptr);
+}
+#endif
+
+/* Executes the given work-group function for all work groups in the grid.
+
+ A work-group function is a version of the original kernel which executes
+ the kernel for all work-items in a work-group. It is produced by gccbrig
+ if it can handle the kernel's barrier usage and is much faster way to
+ execute massive numbers of work-items in a non-SPMD machine than fibers
+ (easily 100x faster). */
+static void
+phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr)
+{
+ hsa_kernel_dispatch_packet_t *dp = context->dp;
+ size_t x, y, z, wg_x, wg_y, wg_z;
+
+ /* TODO: host-side memory management of group and private segment
+ memory. Agents in general are less likely to support efficient dynamic mem
+ allocation. */
+ if (dp->group_segment_size > 0
+ && posix_memalign (&group_base_ptr, GROUP_SEGMENT_ALIGN,
+ dp->group_segment_size) != 0)
+ phsa_fatal_error (3);
+
+ context->group_segment_start_addr = (size_t) group_base_ptr;
+
+ /* HSA seems to allow the WG size to be larger than the grid size. We need
+ to saturate the effective WG size to the grid size to prevent the extra WIs
+ from executing. */
+ size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
+ sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
+ sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
+ sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
+ sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
+
+#ifdef BENCHMARK_PHSA_RT
+ wi_total = (uint64_t) dp->grid_size_x
+ * (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
+ * (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
+#endif
+
+ context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
+
+ int dims = dp->setup & 0x3;
+
+ context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
+ / dp->workgroup_size_x;
+
+ context->wg_max_y
+ = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
+ / dp->workgroup_size_y;
+
+ context->wg_max_z
+ = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
+ / dp->workgroup_size_z;
+
+#ifdef DEBUG_PHSA_RT
+ printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
+ "wg size %lu/%lu/%lu grid size %u/%u/%u\n",
+ context->wg_min_x, context->wg_min_y, context->wg_min_z,
+ context->wg_max_x, context->wg_max_y, context->wg_max_z,
+ sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
+ dp->grid_size_y, dp->grid_size_z);
+#endif
+
+ PHSAWorkItem wi;
+ PHSAWorkGroup wg;
+ wi.wg = &wg;
+ wi.x = wi.y = wi.z = 0;
+ wi.launch_data = context;
+
+#ifdef BENCHMARK_PHSA_RT
+ start_time = clock ();
+ uint64_t wg_count = 0;
+#endif
+
+ size_t wg_size = __hsail_workgroupsize (0, &wi)
+ * __hsail_workgroupsize (1, &wi)
+ * __hsail_workgroupsize (2, &wi);
+
+ void *private_base_ptr = NULL;
+ if (dp->private_segment_size > 0
+ && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
+ dp->private_segment_size * wg_size)
+ != 0)
+ phsa_fatal_error (3);
+
+ wg.alloca_stack_p = dp->private_segment_size * wg_size;
+ wg.alloca_frame_p = wg.alloca_stack_p;
+
+ wg.private_base_ptr = private_base_ptr;
+ wg.group_base_ptr = group_base_ptr;
+
+#ifdef DEBUG_PHSA_RT
+ printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
+ wg_size, private_base_ptr);
+#endif
+
+ for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
+ for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
+ for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
+ {
+ wi.wg->x = wg_x;
+ wi.wg->y = wg_y;
+ wi.wg->z = wg_z;
+
+ context->kernel (context->kernarg_addr, &wi, group_base_ptr,
+ private_base_ptr);
+
+#if defined (BENCHMARK_PHSA_RT)
+ wg_count++;
+ if (wg_count % 1000000 == 0)
+ {
+ clock_t spent_time = clock () - start_time;
+ uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
+ + wg_z * sat_wg_size_z;
+ double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
+ double wis_per_sec = wi_count / spent_time_sec;
+ uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
+
+ printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
+ wi_count, (uint64_t) spent_time_sec,
+ (uint64_t) wis_per_sec, (uint64_t) eta_sec);
+ }
+#endif
+ }
+
+#ifdef BENCHMARK_PHSA_RT
+ clock_t spent_time = clock () - start_time;
+ double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
+ double wis_per_sec = wi_total / spent_time_sec;
+
+ printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
+ (uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
+#endif
+
+ if (dp->group_segment_size > 0)
+ free (group_base_ptr);
+
+ free (private_base_ptr);
+ private_base_ptr = NULL;
+}
+
+/* gccbrig generates the following from each HSAIL kernel:
+
+ 1) The actual kernel function (a single work-item kernel or a work-group
+ function) generated from HSAIL (BRIG).
+
+ static void _Kernel (void* args, void* context, void* group_base_ptr)
+ {
+ ...
+ }
+
+ 2) A public facing kernel function that is called from the PHSA runtime:
+
+ a) A single work-item function (that requires fibers for multi-WI):
+
+ void Kernel (void* context)
+ {
+ __launch_launch_kernel (_Kernel, context);
+ }
+
+ or
+
+ b) a when gccbrig could generate a work-group function:
+
+ void Kernel (void* context)
+ {
+ __hsail_launch_wg_function (_Kernel, context);
+ }
+*/
+
+#ifdef HAVE_FIBERS
+
+void
+__hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
+ void *group_base_ptr)
+{
+ context->kernel = kernel;
+ phsa_spawn_work_items (context, group_base_ptr);
+}
+#endif
+
+void
+__hsail_launch_wg_function (gccbrigKernelFunc kernel,
+ PHSAKernelLaunchData *context, void *group_base_ptr)
+{
+ context->kernel = kernel;
+ phsa_execute_work_groups (context, group_base_ptr);
+}
+
+uint32_t
+__hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
+{
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+
+ uint32_t id;
+ switch (dim)
+ {
+ default:
+ case 0:
+ /* Overflow semantics in the case of WG dim > grid dim. */
+ id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+ % dp->grid_size_x;
+ break;
+ case 1:
+ id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+ % dp->grid_size_y;
+ break;
+ case 2:
+ id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+ % dp->grid_size_z;
+ break;
+ }
+ return id;
+}
+
+uint64_t
+__hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
+{
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+
+ uint64_t id;
+ switch (dim)
+ {
+ default:
+ case 0:
+ /* Overflow semantics in the case of WG dim > grid dim. */
+ id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+ % dp->grid_size_x;
+ break;
+ case 1:
+ id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+ % dp->grid_size_y;
+ break;
+ case 2:
+ id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+ % dp->grid_size_z;
+ break;
+ }
+ return id;
+}
+
+
+uint32_t
+__hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
+{
+ PHSAWorkItem *c = (PHSAWorkItem *) context;
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+
+ /* The number of dimensions is in the two least significant bits. */
+ int dims = dp->setup & 0x3;
+
+ uint32_t id;
+ switch (dim)
+ {
+ default:
+ case 0:
+ id = c->x;
+ break;
+ case 1:
+ id = dims < 2 ? 0 : c->y;
+ break;
+ case 2:
+ id = dims < 3 ? 0 : c->z;
+ break;
+ }
+ return id;
+}
+
+uint32_t
+__hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
+{
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+ int dims = dp->setup & 0x3;
+
+ uint32_t id;
+ switch (dim)
+ {
+ default:
+ case 0:
+ id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
+ break;
+ case 1:
+ id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
+ / dp->workgroup_size_y;
+ break;
+ case 2:
+ id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
+ / dp->workgroup_size_z;
+ break;
+ }
+ return id;
+}
+
+uint32_t
+__hsail_workitemflatid (PHSAWorkItem *c)
+{
+ hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
+
+ return c->x + c->y * dp->workgroup_size_x
+ + c->z * dp->workgroup_size_x * dp->workgroup_size_y;
+}
+
+uint32_t
+__hsail_currentworkitemflatid (PHSAWorkItem *c)
+{
+ hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
+
+ return c->x + c->y * __hsail_currentworkgroupsize (0, c)
+ + c->z * __hsail_currentworkgroupsize (0, c)
+ * __hsail_currentworkgroupsize (1, c);
+}
+
+void
+__hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
+{
+ switch (dim)
+ {
+ default:
+ case 0:
+ context->x = id;
+ break;
+ case 1:
+ context->y = id;
+ break;
+ case 2:
+ context->z = id;
+ break;
+ }
+}
+
+uint64_t
+__hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
+{
+ PHSAWorkItem *c = (PHSAWorkItem *) context;
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+
+ /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
+ uint64_t id0 = __hsail_workitemabsid (0, context);
+ uint64_t id1 = __hsail_workitemabsid (1, context);
+ uint64_t id2 = __hsail_workitemabsid (2, context);
+
+ uint64_t max0 = dp->grid_size_x;
+ uint64_t max1 = dp->grid_size_y;
+ uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
+
+ return id;
+}
+
+uint32_t
+__hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
+{
+ PHSAWorkItem *c = (PHSAWorkItem *) context;
+ hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
+
+ /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
+ uint64_t id0 = __hsail_workitemabsid (0, context);
+ uint64_t id1 = __hsail_workitemabsid (1, context);
+ uint64_t id2 = __hsail_workitemabsid (2, context);
+
+ uint64_t max0 = dp->grid_size_x;
+ uint64_t max1 = dp->grid_size_y;
+ uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
+ return (uint32_t) id;
+}
+
+uint32_t
+__hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
+{
+ hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
+ uint32_t wg_size = 0;
+ switch (dim)
+ {
+ default:
+ case 0:
+ if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
+ wg_size = dp->workgroup_size_x; /* Full WG. */
+ else
+ wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
+ break;
+ case 1:
+ if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
+ wg_size = dp->workgroup_size_y; /* Full WG. */
+ else
+ wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
+ break;
+ case 2:
+ if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
+ wg_size = dp->workgroup_size_z; /* Full WG. */
+ else
+ wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
+ break;
+ }
+ return wg_size;
+}
+
+uint32_t
+__hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
+{
+ hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
+ switch (dim)
+ {
+ default:
+ case 0:
+ return dp->workgroup_size_x;
+ case 1:
+ return dp->workgroup_size_y;
+ case 2:
+ return dp->workgroup_size_z;
+ }
+}
+
+uint32_t
+__hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
+{
+ hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
+ switch (dim)
+ {
+ default:
+ case 0:
+ return dp->grid_size_x;
+ case 1:
+ return dp->grid_size_y;
+ case 2:
+ return dp->grid_size_z;
+ }
+}
+
+uint32_t
+__hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
+{
+ switch (dim)
+ {
+ default:
+ case 0:
+ return wi->wg->x;
+ case 1:
+ return wi->wg->y;
+ case 2:
+ return wi->wg->z;
+ }
+}
+
+uint32_t
+__hsail_dim (PHSAWorkItem *wi)
+{
+ hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
+ return dp->setup & 0x3;
+}
+
+uint64_t
+__hsail_packetid (PHSAWorkItem *wi)
+{
+ return wi->launch_data->packet_id;
+}
+
+uint32_t
+__hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
+{
+ return (uint32_t) wi->launch_data->dp->completion_signal.handle;
+}
+
+uint64_t
+__hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
+{
+ return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
+}
+
+#ifdef HAVE_FIBERS
+void
+__hsail_barrier (PHSAWorkItem *wi)
+{
+ fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
+}
+#endif
+
+/* Return a 32b private segment address that points to a dynamically
+ allocated chunk of 'size' with 'align'.
+
+ Allocates the space from the end of the private segment allocated
+ for the whole work group. In implementations with separate private
+ memories per WI, we will need to have a stack pointer per WI. But in
+ the current implementation, the segment is shared, so we possibly
+ save some space in case all WIs do not call the alloca.
+
+ The "alloca frames" are organized as follows:
+
+ wg->alloca_stack_p points to the last allocated data (initially
+ outside the private segment)
+ wg->alloca_frame_p points to the first address _outside_ the current
+ function's allocations (initially to the same as alloca_stack_p)
+
+ The data is allocated downwards from the end of the private segment.
+
+ In the beginning of a new function which has allocas, a new alloca
+ frame is pushed which adds the current alloca_frame_p (the current
+ function's frame starting point) to the top of the alloca stack and
+ alloca_frame_p is set to the current stack position.
+
+ At the exit points of a function with allocas, the alloca frame
+ is popped before returning. This involves popping the alloca_frame_p
+ to the one of the previous function in the call stack, and alloca_stack_p
+ similarly, to the position of the last word alloca'd by the previous
+ function.
+ */
+
+uint32_t
+__hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
+{
+ volatile PHSAWorkGroup *wg = wi->wg;
+ uint32_t new_pos = wg->alloca_stack_p - size;
+ while (new_pos % align != 0)
+ new_pos--;
+ wg->alloca_stack_p = new_pos;
+
+#ifdef DEBUG_ALLOCA
+ printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
+ wg->alloca_stack_p, wg->alloca_frame_p);
+#endif
+ return new_pos;
+}
+
+/* Initializes a new "alloca frame" in the private segment.
+ This should be called at all the function entry points in case
+ the function contains at least one call to alloca. */
+
+void
+__hsail_alloca_push_frame (PHSAWorkItem *wi)
+{
+ volatile PHSAWorkGroup *wg = wi->wg;
+
+ /* Store the alloca_frame_p without any alignment padding so
+ we know exactly where the previous frame ended after popping
+ it. */
+#ifdef DEBUG_ALLOCA
+ printf ("--- push frame ");
+#endif
+ uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
+ memcpy (wg->private_base_ptr + last_word_offs,
+ (const void *) &wg->alloca_frame_p, 4);
+ wg->alloca_frame_p = last_word_offs;
+
+#ifdef DEBUG_ALLOCA
+ printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
+#endif
+}
+
+/* Frees the current "alloca frame" and restores the frame
+ pointer.
+ This should be called at all the function return points in case
+ the function contains at least one call to alloca. Restores the
+ alloca stack to the condition it was before pushing the frame
+ the last time. */
+void
+__hsail_alloca_pop_frame (PHSAWorkItem *wi)
+{
+ volatile PHSAWorkGroup *wg = wi->wg;
+
+ wg->alloca_stack_p = wg->alloca_frame_p;
+ memcpy (&wg->alloca_frame_p,
+ (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
+ /* Now frame_p points to the beginning of the previous function's
+ frame and stack_p to its end. */
+
+ wg->alloca_stack_p += 4;
+
+#ifdef DEBUG_ALLOCA
+ printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
+ wg->alloca_frame_p);
+#endif
+}