From 22a3f1b30c8b6f907066fc19081e0c423feb0c7f Mon Sep 17 00:00:00 2001
From: Christoph Muellner
Date: Fri, 14 Sep 2018 10:33:06 +0200
Subject: aarch64: Don't use DC ZVA for memset.

This optimises memset for SPEC rate runs on Xgene3 processors.
Provided by Feng Xue.

Signed-off-by: Christoph Muellner
---
 sysdeps/aarch64/memset.S | 155 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 98 insertions(+), 57 deletions(-)

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 4a45459361..3de9bce730 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -34,21 +34,24 @@ ENTRY_ALIGN (MEMSET, 6)
 	DELOUSE (0)
 	DELOUSE (2)
 
-	dup	v0.16B, valw
-	add	dstend, dstin, count
+	bfi	valw, valw, 8, 8
+	bfi	valw, valw, 16, 16
+	bfi	val, val, 32, 32
+
+1:	add	dstend, dstin, count
 
 	cmp	count, 96
 	b.hi	L(set_long)
 	cmp	count, 16
 	b.hs	L(set_medium)
-	mov	val, v0.D[0]
 
 	/* Set 0..15 bytes.  */
 	tbz	count, 3, 1f
 	str	val, [dstin]
 	str	val, [dstend, -8]
 	ret
-	nop
+
+	.p2align 3
 1:	tbz	count, 2, 2f
 	str	valw, [dstin]
 	str	valw, [dstend, -4]
@@ -59,45 +62,50 @@ ENTRY_ALIGN (MEMSET, 6)
 	strh	valw, [dstend, -2]
 3:	ret
 
-	/* Set 17..96 bytes.  */
+	.p2align 3
+	/* Set 16..96 bytes.  */
 L(set_medium):
-	str	q0, [dstin]
+	stp	val, val, [dstin]
 	tbnz	count, 6, L(set96)
-	str	q0, [dstend, -16]
+	stp	val, val, [dstend, -16]
 	tbz	count, 5, 1f
-	str	q0, [dstin, 16]
-	str	q0, [dstend, -32]
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstend, -32]
 1:	ret
 
 	.p2align 4
 	/* Set 64..96 bytes.  Write 64 bytes from the start and
 	   32 bytes from the end.  */
 L(set96):
-	str	q0, [dstin, 16]
-	stp	q0, q0, [dstin, 32]
-	stp	q0, q0, [dstend, -32]
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstin, 32]
+	stp	val, val, [dstin, 48]
+	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
 	ret
 
 	.p2align 3
-	nop
 L(set_long):
-	and	valw, valw, 255
+	stp	val, val, [dstin]
 	bic	dst, dstin, 15
-	str	q0, [dstin]
-	cmp	count, 256
-	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
+#	cmp	count, 512
+#	ccmp	val, 0, 0, cs
+#	b.eq	L(try_zva)
 L(no_zva):
 	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
+	sub	count, count, 64+16+1	/* Adjust count and bias for loop.  */
+1:	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
+	stp	val, val, [dst, 64]!
L(tail64):
 	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
+	b.hs	1b
+	tbz	count, 5, 1f
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+1:	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
 	ret
 
 L(try_zva):
@@ -111,27 +119,48 @@ L(try_zva):
 	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
 	b.ne	L(zva_128)
 
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
 L(zva_64):
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
 	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
+
+	/*
+	 * Above memory writes might cross cache line boundary, and cause a
+	 * partially dirty cache line. But it seems that DC ZVA can not handle
+	 * zeroing of partial dirty cache line efficiently, probably it still
+	 * requires load of untouched part of the cache line before zeroing.
+	 *
+	 * Write the first 64 byte aligned block using stp to force a fully
+	 * dirty cache line.
+ */ + stp val, val, [dst, 64] + stp val, val, [dst, 80] + stp val, val, [dst, 96] + stp val, val, [dst, 112] + sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + sub count, count, 128+64+64+1 /* Adjust count and bias for loop. */ add dst, dst, 128 - nop 1: dc zva, dst add dst, dst, 64 subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + b.hs 1b + + /* + * Write the last 64 byte aligned block using stp to force a fully + * dirty cache line. + */ + stp val, val, [dst, 0] + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + + tbz count, 5, 1f + stp val, val, [dst, 64] + stp val, val, [dst, 80] +1: stp val, val, [dstend, -32] + stp val, val, [dstend, -16] ret .p2align 3 @@ -139,22 +168,29 @@ L(zva_128): cmp tmp1w, 5 /* ZVA size is 128 bytes. */ b.ne L(zva_other) - str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + stp val, val, [dst, 64] + stp val, val, [dst, 80] + stp val, val, [dst, 96] + stp val, val, [dst, 112] bic dst, dst, 127 sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 + sub count, count, 128+128+1 /* Adjust count and bias for loop. */ +1: add dst, dst, 128 + dc zva, dst subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + b.hs 1b + tbz count, 6, 1f + stp val, val, [dst, 128] + stp val, val, [dst, 144] + stp val, val, [dst, 160] + stp val, val, [dst, 176] +1: stp val, val, [dstend, -64] + stp val, val, [dstend, -48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] ret L(zva_other): @@ -162,16 +198,18 @@ L(zva_other): lsl zva_lenw, tmp2w, tmp1w add tmp1, zva_len, 64 /* Max alignment bytes written. */ cmp count, tmp1 - blo L(no_zva) + b.lo L(no_zva) sub tmp2, zva_len, 1 add tmp1, dst, zva_len + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ add dst, dst, 16 subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] +1: stp val, val, [dst], 64 + stp val, val, [dst, -48] + stp val, val, [dst, -32] + stp val, val, [dst, -16] subs count, count, 64 b.hi 1b 2: mov dst, tmp1 @@ -182,7 +220,10 @@ L(zva_other): add dst, dst, zva_len subs count, count, zva_len b.hs 3b -4: add count, count, zva_len + cbnz count, 4f + ret +4: add count, count, tmp2 + sub dst, dst, 16 b L(tail64) #endif -- cgit v1.2.3
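
Note for readers who do not follow AArch64 assembly: the sketch below is a
rough C model of the two ideas the patch relies on. The three bfi
instructions replicate the fill byte across a 64-bit general-purpose
register, and the stp val, val sequences then store that register in
16-byte pairs instead of using SIMD q-register stores or DC ZVA. The helper
names (replicate_byte, memset_pairs) are invented for this illustration,
and the alignment and small-size handling done by the real assembly is
deliberately left out.

/* Illustrative sketch only -- not part of the patch above.  */

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Equivalent of:  bfi valw, valw, 8, 8;  bfi valw, valw, 16, 16;
   bfi val, val, 32, 32  -- spread the low byte over all 8 bytes.  */
static uint64_t
replicate_byte (int c)
{
  uint64_t v = (uint8_t) c;
  v |= v << 8;			/* byte pattern now fills bits 0..15  */
  v |= v << 16;			/* bits 0..31  */
  v |= v << 32;			/* bits 0..63  */
  return v;
}

/* Rough model of the stp-based store loop: two 64-bit stores (16 bytes)
   per step, with a plain byte loop for the tail.  */
static void
memset_pairs (void *dstin, int c, size_t count)
{
  uint64_t val = replicate_byte (c);
  unsigned char *dst = dstin;

  while (count >= 16)
    {
      memcpy (dst, &val, 8);		/* stp val, val, [dst]  */
      memcpy (dst + 8, &val, 8);
      dst += 16;
      count -= 16;
    }
  while (count-- > 0)
    *dst++ = (unsigned char) c;
}

int
main (void)
{
  char buf[100];
  memset_pairs (buf, 0xab, sizeof buf);
  printf ("0x%02x\n", (unsigned char) buf[99]);	/* prints 0xab  */
  return 0;
}

The sketch only shows the data flow; the performance argument of the patch
(avoiding DC ZVA on partially dirty cache lines) lives entirely in the
assembly above.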