author     Christoph Muellner <christoph.muellner@theobroma-systems.com>  2018-09-14 10:33:06 +0200
committer  Christoph Muellner <christoph.muellner@theobroma-systems.com>  2018-09-14 10:33:06 +0200
commit     22a3f1b30c8b6f907066fc19081e0c423feb0c7f (patch)
tree       1b5721c2cfdfc5b41278420468ca649f4eb8bfd5
parent     a1f110b8774c7a61a6e79cc65c03d8250b0b2544 (diff)

aarch64: Don't use DC ZVA for memset. (glibc-2.28-amp-branch)

This optimises memset for SPEC rate runs on Xgene3 processors.
Provided by Feng Xue.

Signed-off-by: Christoph Muellner <christoph.muellner@theobroma-systems.com>
-rw-r--r--  sysdeps/aarch64/memset.S  155
1 file changed, 98 insertions, 57 deletions
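
The heart of the change is visible in the first hunk below: the SIMD broadcast (dup v0.16B, valw) and the q0 stores are replaced by a general-purpose-register splat built with three bfi instructions and stored with stp val, val pairs, and the DC ZVA path is disabled. As a rough illustration of what that bfi sequence computes, here is a minimal C sketch; the helper names are mine, not from glibc:

    #include <stdint.h>
    #include <string.h>

    /* Broadcast the low byte of c into all eight bytes of a 64-bit value,
       mirroring the patch's bfi sequence: byte -> halfword -> word ->
       doubleword.  */
    static inline uint64_t
    splat_byte (unsigned int c)
    {
      uint64_t v = c & 0xff;
      v |= v << 8;    /* bfi valw, valw, 8, 8   */
      v |= v << 16;   /* bfi valw, valw, 16, 16 */
      v |= v << 32;   /* bfi val, val, 32, 32   */
      return v;
    }

    /* Each "stp val, val" in the patched code then stores two copies of
       this doubleword, i.e. 16 bytes per instruction.  */
    static void
    store16 (unsigned char *p, unsigned int c)
    {
      uint64_t v = splat_byte (c);
      memcpy (p, &v, 8);
      memcpy (p + 8, &v, 8);
    }
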
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 4a45459361..3de9bce730 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -34,21 +34,24 @@ ENTRY_ALIGN (MEMSET, 6)
DELOUSE (0)
DELOUSE (2)
- dup v0.16B, valw
- add dstend, dstin, count
+ bfi valw, valw, 8, 8
+ bfi valw, valw, 16, 16
+ bfi val, val, 32, 32
+
+1: add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
cmp count, 16
b.hs L(set_medium)
- mov val, v0.D[0]
/* Set 0..15 bytes. */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend, -8]
ret
- nop
+
+ .p2align 3
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -59,45 +62,50 @@ ENTRY_ALIGN (MEMSET, 6)
strh valw, [dstend, -2]
3: ret
- /* Set 17..96 bytes. */
+ .p2align 3
+ /* Set 16..96 bytes. */
L(set_medium):
- str q0, [dstin]
+ stp val, val, [dstin]
tbnz count, 6, L(set96)
- str q0, [dstend, -16]
+ stp val, val, [dstend, -16]
tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
+ stp val, val, [dstin, 16]
+ stp val, val, [dstend, -32]
1: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
L(set96):
- str q0, [dstin, 16]
- stp q0, q0, [dstin, 32]
- stp q0, q0, [dstend, -32]
+ stp val, val, [dstin, 16]
+ stp val, val, [dstin, 32]
+ stp val, val, [dstin, 48]
+ stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
ret
.p2align 3
- nop
L(set_long):
- and valw, valw, 255
+ stp val, val, [dstin]
bic dst, dstin, 15
- str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
+# cmp count, 512
+# ccmp val, 0, 0, cs
+# b.eq L(try_zva)
L(no_zva):
sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
+ sub count, count, 64+16+1 /* Adjust count and bias for loop. */
+1: stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+ stp val, val, [dst, 48]
+ stp val, val, [dst, 64]!
L(tail64):
subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hs 1b
+ tbz count, 5, 1f
+ stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+1: stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
ret
L(try_zva):
@@ -111,27 +119,48 @@ L(try_zva):
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
b.ne L(zva_128)
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
L(zva_64):
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
+ stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+ stp val, val, [dst, 48]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
+
+ /*
+ * Above memory writes might cross cache line boundary, and cause a
+ * partially dirty cache line. But it seems that DC ZVA can not handle
+ * zeroing of partial dirty cache line efficiently, probably it still
+ * requires load of untouched part of the cache line before zeroing.
+ *
+ * Write the first 64 byte aligned block using stp to force a fully
+ * dirty cache line.
+ */
+ stp val, val, [dst, 64]
+ stp val, val, [dst, 80]
+ stp val, val, [dst, 96]
+ stp val, val, [dst, 112]
+
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ sub count, count, 128+64+64+1 /* Adjust count and bias for loop. */
add dst, dst, 128
- nop
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hs 1b
+
+ /*
+ * Write the last 64 byte aligned block using stp to force a fully
+ * dirty cache line.
+ */
+ stp val, val, [dst, 0]
+ stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+ stp val, val, [dst, 48]
+
+ tbz count, 5, 1f
+ stp val, val, [dst, 64]
+ stp val, val, [dst, 80]
+1: stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
ret
.p2align 3
@@ -139,22 +168,29 @@ L(zva_128):
cmp tmp1w, 5 /* ZVA size is 128 bytes. */
b.ne L(zva_other)
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
+ stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+ stp val, val, [dst, 48]
+ stp val, val, [dst, 64]
+ stp val, val, [dst, 80]
+ stp val, val, [dst, 96]
+ stp val, val, [dst, 112]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128+128+1 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hs 1b
+ tbz count, 6, 1f
+ stp val, val, [dst, 128]
+ stp val, val, [dst, 144]
+ stp val, val, [dst, 160]
+ stp val, val, [dst, 176]
+1: stp val, val, [dstend, -64]
+ stp val, val, [dstend, -48]
+ stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
ret
L(zva_other):
@@ -162,16 +198,18 @@ L(zva_other):
lsl zva_lenw, tmp2w, tmp1w
add tmp1, zva_len, 64 /* Max alignment bytes written. */
cmp count, tmp1
- blo L(no_zva)
+ b.lo L(no_zva)
sub tmp2, zva_len, 1
add tmp1, dst, zva_len
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
add dst, dst, 16
subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
+1: stp val, val, [dst], 64
+ stp val, val, [dst, -48]
+ stp val, val, [dst, -32]
+ stp val, val, [dst, -16]
subs count, count, 64
b.hi 1b
2: mov dst, tmp1
@@ -182,7 +220,10 @@ L(zva_other):
add dst, dst, zva_len
subs count, count, zva_len
b.hs 3b
-4: add count, count, zva_len
+ cbnz count, 4f
+ ret
+4: add count, count, tmp2
+ sub dst, dst, 16
b L(tail64)
#endif
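
For context on the zva_64/zva_128/zva_other checks above (cmp tmp1w, 4 and cmp tmp1w, 5, plus lsl zva_lenw, tmp2w, tmp1w): the block size zeroed by DC ZVA is advertised in the DCZID_EL0 register. A hedged C sketch of that decoding, assuming AArch64 with GCC/Clang inline asm; the function name is illustrative, not glibc's:

    #include <stdint.h>

    /* DCZID_EL0[3:0] (BS) holds log2 of the DC ZVA block size in 4-byte
       words, so the size in bytes is 4 << BS; bit 4 (DZP) set means
       DC ZVA must not be used.  */
    static inline unsigned int
    zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 0x10)            /* DZP: zeroing prohibited.  */
        return 0;
      return 4u << (dczid & 0xf);  /* BS == 4 -> 64 bytes, 5 -> 128 bytes.  */
    }
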