/* strcpy with AVX2
   Copyright (C) 2011-2018 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

/* zero register */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* mask register */
#define ymmM	ymm1

# ifndef USE_AS_STRCAT

	.section .text.avx,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax	/* save result */
#  endif

# endif

	vpxor	%xmmZ, %xmmZ, %xmmZ

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu	(%rsi, %rcx), %ymm2	/* copy VEC_SIZE bytes */
	vmovdqu	%ymm2, (%rdi)

/* If source address alignment != destination address alignment */

	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa	(%rsi, %rcx), %ymm2
	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm3, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm4, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu	%ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
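
/* Main loop: the source is now (VEC_SIZE * 4)-aligned.  Each iteration
   loads four vectors, folds them with vpminub and does a single
   compare, so one vpmovmskb tells whether any of the VEC_SIZE * 4
   bytes is the terminating NUL.  %ymmM is all zero on entry here (the
   last compare above found no NUL byte), so it serves as the zero
   register inside the loop.  */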
L(UnalignedFourVecSizeLoop):
	vmovdqa	(%rsi), %ymm4
	vmovdqa	VEC_SIZE(%rsi), %ymm5
	vmovdqa	(VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa	(VEC_SIZE * 3)(%rsi), %ymm7
	vpminub	%ymm5, %ymm4, %ymm2
	vpminub	%ymm7, %ymm6, %ymm3
	vpminub	%ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu	%ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa	(%rsi), %ymm4
	vmovdqu	%ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa	VEC_SIZE(%rsi), %ymm5
	vpminub	%ymm5, %ymm4, %ymm2
	vmovdqu	%ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa	(VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu	%ymm7, -VEC_SIZE(%rdi)
	vmovdqa	(VEC_SIZE * 3)(%rsi), %ymm7
	vpminub	%ymm7, %ymm6, %ymm3
	vpminub	%ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu	%ymm4, (%rdi)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */
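
/* Taken from the entry check when the source lies at most VEC_SIZE * 2
   bytes past a (VEC_SIZE * 4)-byte boundary, so the two unaligned
   vector loads below stay inside that aligned block and cannot cross a
   page boundary.  If no NUL terminator is found in them, control joins
   L(UnalignVecSizeBoth) above.  */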
L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu	(%rsi), %ymm3
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu	%ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */
# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu	%ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu	%ymm4, (%rdi)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu	%ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu	%ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu	%ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu	%ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)
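
/* Case2: a NUL byte was found in the current vector (nonzero mask in
   %rdx), but the remaining strncpy bound in %r8 may end before it, so
   the copy stops at whichever comes first.  Case3: the bound runs out
   with no NUL seen, and L(StrncpyExit) copies exactly the remaining
   bytes.  */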
/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End of labels for copying 1-VEC_SIZE bytes and 1-(VEC_SIZE*2) bytes----*/

	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit16_31):
	vmovdqu	(%rsi), %xmm2
	vmovdqu	-15(%rsi, %rdx), %xmm3
	vmovdqu	%xmm2, (%rdi)
	vmovdqu	%xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit32_63):
	vmovdqu	(%rsi), %ymm2
	vmovdqu	-31(%rsi, %rdx), %ymm3
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret
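
/* The L(Exit*) stubs above copy through the NUL at index %rdx, the
   larger ones with a pair of overlapping head/tail loads and stores.
   The L(StrncpyExit*) stubs below are used when the length bound stops
   the copy first: they copy exactly %r8 bytes and append a NUL only
   for strcat.  */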
# ifdef USE_AS_STRNCPY

	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu	(%rsi), %xmm2
	vmovdqu	-16(%rsi, %r8), %xmm3
	vmovdqu	%xmm2, (%rdi)
	vmovdqu	%xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	vmovdqu	(%rsi), %ymm2
	vmovdqu	-VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu	(%rsi), %ymm2
	vmovdqu	32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER
	ret

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill17_32):
	vmovdqu	%xmmZ, (%rdi)
	vmovdqu	%xmmZ, -16(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu	%ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
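
/* strncpy must pad the rest of the destination with NUL bytes once the
   terminator has been copied.  %r8 holds the number of bytes still to
   be cleared and %edx is zeroed so the small L(Fill*) stores write
   zeros: write one unaligned vector, align %rdi, clear four vectors
   per iteration, then finish the remainder through L(Fill).  */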
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa	%ymmZ, (%rdi)
	vmovdqa	%ymmZ, VEC_SIZE(%rdi)
	vmovdqa	%ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa	%ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa	%ymmZ, (%rdi)
	vmovdqa	%ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER
	ret

/* end of ifndef USE_AS_STRCAT */
#  endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu	%ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu	%ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif