-rw-r--r--  ChangeLog                                    |   21
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile            |    3
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   12
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-strcpy.h (renamed from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h) | 6
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-avx2.S       |    3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy.c            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-avx2.S      |    4
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy.c           |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-avx2.S       |  275
-rw-r--r--  sysdeps/x86_64/multiarch/strcat.c            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-avx2.S       | 1022
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.c            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-avx2.S      |    3
-rw-r--r--  sysdeps/x86_64/multiarch/strncat.c            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-avx2.S      |    3
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy.c           |    2
16 files changed, 1358 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 101a267ebc..3a94b71fe7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2019-01-14 Leonardo Sandoval <leonardo.sandoval.gonzalez@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
+ stpcpy-avx2 and stpncpy-avx2.
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c:
+ (__libc_ifunc_impl_list): Add tests for __strcat_avx2,
+ __strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
+ and __stpncpy_avx2.
+ * sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
+ ifunc-strcpy.h}: Rename header to a more generic name.
+ * sysdeps/x86_64/multiarch/ifunc-strcpy.h:
+ (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
+ AVX unaligned load is fast and vzeroupper is preferred.
+ * sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
+ * sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
+
2019-01-12 Dmitry V. Levin <ldv@altlinux.org>
* argp/argp-help.c: Fix typo in comment.
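
This commit adds AVX2 variants of strcat, strncat, strcpy, strncpy, stpcpy and stpncpy and wires them into the multiarch framework below.  As a quick reference for what the new implementations must preserve (a standalone sketch based only on the ISO C/POSIX contracts, not part of the commit), the program below exercises the documented behaviour, including the end-pointer returns of stpcpy/stpncpy and the zero-padding of strncpy; on a patched glibc running on an AVX2 machine these calls dispatch to the new *_avx2 entry points through the IFUNCs updated further down.

#define _GNU_SOURCE             /* stpcpy/stpncpy declarations */
#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[64];

  /* strcpy, strcat and strncat return the destination pointer.  */
  assert (strcpy (buf, "hello") == buf && strcmp (buf, "hello") == 0);
  assert (strcat (buf, " world") == buf && strcmp (buf, "hello world") == 0);
  assert (strncat (buf, "!!??", 2) == buf && strcmp (buf, "hello world!!") == 0);

  /* stpcpy returns a pointer to the terminating NUL it wrote.  */
  assert (stpcpy (buf, "hello") == buf + 5);

  /* strncpy zero-fills the tail; stpncpy returns the first NUL written.  */
  memset (buf, 'x', sizeof buf);
  assert (strncpy (buf, "hi", 8) == buf && buf[2] == '\0' && buf[7] == '\0');
  assert (stpncpy (buf, "hi", 8) == buf + 2);

  return 0;
}
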
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e970735..395e432c09 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
strrchr-sse2 strrchr-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+ strcat-avx2 strncat-avx2 \
strcat-ssse3 strncat-ssse3\
+ strcpy-avx2 strncpy-avx2 \
strcpy-sse2 stpcpy-sse2 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+ stpcpy-avx2 stpncpy-avx2 \
strcat-sse2 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cee2aebaee..39a45d0742 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
__stpncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpcpy,
IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
__stpcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcat_avx2)
IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
__strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcpy_avx2)
IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
__strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncat.c. */
IFUNC_IMPL (i, name, strncat,
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncat_avx2)
IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
__strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncpy_avx2)
IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
__strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
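
Each IFUNC_IMPL_ADD line added above registers one more candidate in glibc's implementation list, which the multiarch string tests and benchtests walk so that every variant is exercised rather than only the one the run-time resolver would pick.  Conceptually (the real macros live in ifunc-impl-list.h; the record type and helper below are an illustrative sketch, not glibc's definitions) every entry pairs a name with a usability predicate and a function pointer:

#include <stddef.h>

struct impl_record
{
  const char *name;   /* e.g. "__stpncpy_avx2" */
  int usable;         /* e.g. the value of HAS_ARCH_FEATURE (AVX2_Usable) */
  void *fn;           /* address of that implementation */
};

/* Illustrative helper mirroring the pattern: store one record if there is
   still room in the caller-supplied array, and always advance the count.  */
static size_t
add_impl (struct impl_record *array, size_t i, size_t max,
          const char *name, int usable, void *fn)
{
  if (i < max)
    array[i] = (struct impl_record) { name, usable, fn };
  return i + 1;
}
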
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 4ddca9f15f..bc6a70fc7c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
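
The selector above runs at symbol-resolution time: each wrapper (stpcpy.c, strcpy.c, strcat.c and friends below) includes this header and passes IFUNC_SELECTOR () to libc_ifunc_redirected.  Outside glibc the same dispatch pattern can be written with GCC's ifunc attribute; the sketch below uses illustrative names and compiler builtins in place of glibc's cpu_features bits, choosing an AVX2 body only when the CPU supports it:

#include <string.h>

static char *
my_strcpy_fallback (char *dst, const char *src)
{
  return strcpy (dst, src);       /* portable baseline */
}

static char *
my_strcpy_avx2 (char *dst, const char *src)
{
  /* A real AVX2 body would go here; the fallback stands in for the sketch.  */
  return strcpy (dst, src);
}

/* Resolver: runs once, before the first call, and returns the function that
   calls to my_strcpy will be bound to.  */
static char *(*resolve_my_strcpy (void)) (char *, const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return my_strcpy_avx2;
  return my_strcpy_fallback;
}

char *my_strcpy (char *dst, const char *src)
     __attribute__ ((ifunc ("resolve_my_strcpy")));
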
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000000..f0bd3029fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 0b12f5d7a1..fabe03204b 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
# undef __stpcpy
# define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000000..032b0407d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 71edc291e4..3b618e7491 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
# undef __stpncpy
# define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
new file mode 100644
index 0000000000..b062356427
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -0,0 +1,275 @@
+/* strcat with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+# define STRCAT __strcat_avx2
+# endif
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCAT)
+ mov %rdi, %r9
+# ifdef USE_AS_STRNCAT
+ mov %rdx, %r8
+# endif
+
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $((VEC_SIZE * 4) - 1), %ecx
+ vpxor %xmm6, %xmm6, %xmm6
+ cmp $(VEC_SIZE * 3), %ecx
+ ja L(fourth_vector_boundary)
+ vpcmpeqb (%rdi), %ymm6, %ymm0
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_first_vector)
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ jmp L(align_vec_size_start)
+L(fourth_vector_boundary):
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ vpcmpeqb (%rax), %ymm6, %ymm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ vpmovmskb %ymm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+
+L(align_vec_size_start):
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 4), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+ add $(VEC_SIZE * 5), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
+ add $VEC_SIZE, %rax
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $VEC_SIZE, %rax
+
+ .p2align 4
+L(align_four_vec_loop):
+ vmovaps (%rax), %ymm4
+ vpminub VEC_SIZE(%rax), %ymm4, %ymm4
+ vmovaps (VEC_SIZE * 2)(%rax), %ymm5
+ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
+ add $(VEC_SIZE * 4), %rax
+ vpminub %ymm4, %ymm5, %ymm5
+ vpcmpeqb %ymm5, %ymm6, %ymm5
+ vpmovmskb %ymm5, %edx
+ test %edx, %edx
+ jz L(align_four_vec_loop)
+
+ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
+ sub $(VEC_SIZE * 5), %rax
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+ vpmovmskb %ymm1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+ vpmovmskb %ymm2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+ vpmovmskb %ymm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_null_on_first_vector):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_second_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $VEC_SIZE, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_third_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 2), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fourth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 3), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fifth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+
+ .p2align 4
+L(StartStrcpyPart):
+ lea (%r9, %rax), %rdi
+ mov %rsi, %rcx
+ mov %r9, %rax /* save result */
+
+# ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero)
+# define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-avx2.S"
+#endif
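
Before any bytes are copied, the strcat body above has to find the end of the destination string: it compares aligned 32-byte vectors against the zeroed %ymm6 with VPCMPEQB, converts the compare result into a bitmask with VPMOVMSKB, and the position of the first set bit gives the offset of the terminating NUL (the first, possibly misaligned, vector has the bits before the start of the string masked off).  A compact intrinsics rendering of that scan (a sketch, not the glibc code) is:

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Return the offset of the terminating NUL in S, scanning 32 bytes at a
   time.  Aligned 32-byte loads never cross a page boundary, so reading a
   little before or after the string is safe.  Build with -mavx2.  */
static size_t
find_nul_avx2 (const char *s)
{
  const __m256i zero = _mm256_setzero_si256 ();
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 31);
  unsigned int head = (unsigned int) (s - p);

  __m256i v = _mm256_load_si256 ((const __m256i *) p);
  unsigned int mask = (unsigned int)
    _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
  mask &= ~0u << head;          /* drop matches before the start of S */

  while (mask == 0)
    {
      p += 32;
      v = _mm256_load_si256 ((const __m256i *) p);
      mask = (unsigned int)
        _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
    }
  return (size_t) (p + __builtin_ctz (mask) - s);
}

L(align_four_vec_loop) above is the same idea unrolled by four: it folds four vectors together with VPMINUB so a single compare and branch covers 128 bytes per iteration.
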
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index a0341b9514..92c360afc4 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
# undef strcat
# define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000000..81677f9060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,1022 @@
+/* strcpy with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_avx2
+# endif
+
+# endif
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+# define VEC_SIZE 32
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+/* zero register */
+#define xmmZ xmm0
+#define ymmZ ymm0
+
+/* mask register */
+#define ymmM ymm1
+
+# ifndef USE_AS_STRCAT
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+ test %r8, %r8
+ jz L(ExitZero)
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+# endif
+
+ vpxor %xmmZ, %xmmZ, %xmmZ
+
+ and $((VEC_SIZE * 4) - 1), %ecx
+ cmp $(VEC_SIZE * 2), %ecx
+ jbe L(SourceStringAlignmentLessTwoVecSize)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %ecx
+
+ vpcmpeqb (%rsi), %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ shr %cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $VEC_SIZE, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# else
+ mov $(VEC_SIZE + 1), %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# endif
+ jbe L(CopyVecSizeTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyVecSizeTail)
+
+ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
+ vpmovmskb %ymm2, %edx
+
+# ifdef USE_AS_STRNCPY
+ add $VEC_SIZE, %r10
+ cmp %r10, %r8
+ jbe L(CopyTwoVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyTwoVecSize)
+
+ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
+ vmovdqu %ymm2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(UnalignVecSizeBoth):
+ sub %rcx, %rdi
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+ sbb %rcx, %rcx
+ or %rcx, %r8
+# endif
+ mov $VEC_SIZE, %rcx
+ vmovdqa (%rsi, %rcx), %ymm2
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 3), %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm3, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+ vpcmpeqb %ymm4, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm4, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vmovdqu %ymm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ and $-(VEC_SIZE * 4), %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea (VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+ vmovdqa (%rsi), %ymm4
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm5, %ymm4, %ymm2
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymmM, %ymm3, %ymm3
+ vpmovmskb %ymm3, %edx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+ add $(VEC_SIZE * 4), %rdi
+ add $(VEC_SIZE * 4), %rsi
+ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
+ vmovdqa (%rsi), %ymm4
+ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vpminub %ymm5, %ymm4, %ymm2
+ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqu %ymm7, -VEC_SIZE(%rdi)
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymmM, %ymm3, %ymm3
+ vpmovmskb %ymm3, %edx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jz L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+ vpcmpeqb %ymm4, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_0)
+
+ vpcmpeqb %ymm5, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %ecx
+ test %ecx, %ecx
+ jnz L(CopyVecSizeUnaligned_16)
+
+ vpcmpeqb %ymm6, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_32)
+
+ vpcmpeqb %ymm7, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %ecx
+ bsf %ecx, %edx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 3), %rsi
+ add $(VEC_SIZE * 3), %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+ vmovdqu (%rsi), %ymm3
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $VEC_SIZE, %r8
+# else
+ cmp $(VEC_SIZE + 1), %r8
+# endif
+ jbe L(CopyVecSizeTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyVecSizeTail1)
+
+ vmovdqu %ymm3, (%rdi)
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $(VEC_SIZE * 2), %r8
+# else
+ cmp $((VEC_SIZE * 2) + 1), %r8
+# endif
+ jbe L(CopyTwoVecSize1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyTwoVecSize1)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %ecx
+ jmp L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyVecSize):
+ add %rcx, %rdi
+# endif
+L(CopyVecSizeTail):
+ add %rcx, %rsi
+L(CopyVecSizeTail1):
+ bsf %edx, %edx
+L(CopyVecSizeExit):
+ cmp $32, %edx
+ jae L(Exit32_63)
+ cmp $16, %edx
+ jae L(Exit16_31)
+ cmp $8, %edx
+ jae L(Exit8_15)
+ cmp $4, %edx
+ jae L(Exit4_7)
+ cmp $3, %edx
+ je L(Exit3)
+ cmp $1, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ movb $0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(CopyTwoVecSize1):
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $VEC_SIZE, %r8
+# endif
+ jmp L(CopyVecSizeTail1)
+
+ .p2align 4
+L(CopyTwoVecSize):
+ bsf %edx, %edx
+ add %rcx, %rsi
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ jmp L(CopyVecSizeExit)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_0):
+ bsf %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm4, (%rdi)
+ add $((VEC_SIZE * 4) - 1), %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ jmp L(CopyVecSizeExit)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_16):
+ bsf %ecx, %edx
+ vmovdqu %ymm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea VEC_SIZE(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ add $((VEC_SIZE * 3) - 1), %r8
+ sub %rdx, %r8
+ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_32):
+ bsf %edx, %edx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
+# endif
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ add $((VEC_SIZE * 2) - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 2), %rsi
+ add $(VEC_SIZE * 2), %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyVecSizeUnalignedVec6):
+ vmovdqu %ymm6, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec5):
+ vmovdqu %ymm5, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec4):
+ vmovdqu %ymm4, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec3):
+ vmovdqu %ymm3, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+# endif
+
+/* Case2 */
+
+ .p2align 4
+L(CopyVecSizeCase2):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2):
+ add %rcx, %rsi
+ bsf %edx, %edx
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+L(CopyVecSizeTailCase2):
+ add %rcx, %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+L(CopyVecSizeTail1Case2):
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyTwoVecSizeCase2)
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTailCase2)
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+ add $VEC_SIZE, %rdi
+ add $VEC_SIZE, %rsi
+ sub $VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTail1Case2)
+ jmp L(StrncpyExit)
+# endif
+
+/*------------End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes------------*/
+
+ .p2align 4
+L(Exit1):
+ movzwl (%rsi), %edx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit2):
+ movzwl (%rsi), %ecx
+ mov %cx, (%rdi)
+ movb $0, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit4_7):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov -3(%rsi, %rdx), %ecx
+ mov %ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit8_15):
+ mov (%rsi), %rcx
+ mov -7(%rsi, %rdx), %r9
+ mov %rcx, (%rdi)
+ mov %r9, -7(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit16_31):
+ vmovdqu (%rsi), %xmm2
+ vmovdqu -15(%rsi, %rdx), %xmm3
+ vmovdqu %xmm2, (%rdi)
+ vmovdqu %xmm3, -15(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit32_63):
+ vmovdqu (%rsi), %ymm2
+ vmovdqu -31(%rsi, %rdx), %ymm3
+ vmovdqu %ymm2, (%rdi)
+ vmovdqu %ymm3, -31(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ VZEROUPPER
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit1):
+ movzbl (%rsi), %edx
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 1(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit2):
+ movzwl (%rsi), %edx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 2(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit3_4):
+ movzwl (%rsi), %ecx
+ movzwl -2(%rsi, %r8), %edx
+ mov %cx, (%rdi)
+ mov %dx, -2(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit5_8):
+ mov (%rsi), %ecx
+ mov -4(%rsi, %r8), %edx
+ mov %ecx, (%rdi)
+ mov %edx, -4(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit9_16):
+ mov (%rsi), %rcx
+ mov -8(%rsi, %r8), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, -8(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit17_32):
+ vmovdqu (%rsi), %xmm2
+ vmovdqu -16(%rsi, %r8), %xmm3
+ vmovdqu %xmm2, (%rdi)
+ vmovdqu %xmm3, -16(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit33_64):
+ /* 0/32, 31/16 */
+ vmovdqu (%rsi), %ymm2
+ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
+ vmovdqu %ymm2, (%rdi)
+ vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(StrncpyExit65):
+ /* 0/32, 32/32, 64/1 */
+ vmovdqu (%rsi), %ymm2
+ vmovdqu 32(%rsi), %ymm3
+ mov 64(%rsi), %cl
+ vmovdqu %ymm2, (%rdi)
+ vmovdqu %ymm3, 32(%rdi)
+ mov %cl, 64(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 65(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 65(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill3_4):
+ mov %dx, (%rdi)
+ mov %dx, -2(%rdi, %r8)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill5_8):
+ mov %edx, (%rdi)
+ mov %edx, -4(%rdi, %r8)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill9_16):
+ mov %rdx, (%rdi)
+ mov %rdx, -8(%rdi, %r8)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Fill17_32):
+ vmovdqu %xmmZ, (%rdi)
+ vmovdqu %xmmZ, -16(%rdi, %r8)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec2):
+ vmovdqu %ymm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyVecSizeVecExit):
+ bsf %edx, %edx
+ add $(VEC_SIZE - 1), %r8
+ add %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ xor %edx, %edx
+ sub $VEC_SIZE, %r8
+ jbe L(StrncpyFillExit)
+
+ vmovdqu %ymmZ, (%rdi)
+ add $VEC_SIZE, %rdi
+
+ mov %rdi, %rsi
+ and $(VEC_SIZE - 1), %esi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $(VEC_SIZE * 4), %r8
+ jb L(StrncpyFillLessFourVecSize)
+
+L(StrncpyFillLoopVmovdqa):
+ vmovdqa %ymmZ, (%rdi)
+ vmovdqa %ymmZ, VEC_SIZE(%rdi)
+ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
+ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE * 4), %rdi
+ sub $(VEC_SIZE * 4), %r8
+ jae L(StrncpyFillLoopVmovdqa)
+
+L(StrncpyFillLessFourVecSize):
+ add $(VEC_SIZE * 2), %r8
+ jl L(StrncpyFillLessTwoVecSize)
+ vmovdqa %ymmZ, (%rdi)
+ vmovdqa %ymmZ, VEC_SIZE(%rdi)
+ add $(VEC_SIZE * 2), %rdi
+ sub $VEC_SIZE, %r8
+ jl L(StrncpyFillExit)
+ vmovdqa %ymmZ, (%rdi)
+ add $VEC_SIZE, %rdi
+ jmp L(Fill)
+
+ .p2align 4
+L(StrncpyFillLessTwoVecSize):
+ add $VEC_SIZE, %r8
+ jl L(StrncpyFillExit)
+ vmovdqa %ymmZ, (%rdi)
+ add $VEC_SIZE, %rdi
+ jmp L(Fill)
+
+ .p2align 4
+L(StrncpyFillExit):
+ add $VEC_SIZE, %r8
+L(Fill):
+ cmp $17, %r8d
+ jae L(Fill17_32)
+ cmp $9, %r8d
+ jae L(Fill9_16)
+ cmp $5, %r8d
+ jae L(Fill5_8)
+ cmp $3, %r8d
+ jae L(Fill3_4)
+ cmp $1, %r8d
+ ja L(Fill2)
+ je L(Fill1)
+ VZEROUPPER
+ ret
+
+/* end of ifndef USE_AS_STRCAT */
+# endif
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+ lea (VEC_SIZE * 4)(%r8), %rcx
+ and $-VEC_SIZE, %rcx
+ add $(VEC_SIZE * 3), %r8
+ jl L(CopyVecSizeCase3)
+ vmovdqu %ymm4, (%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 4)(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (VEC_SIZE * 4)(%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+ xor %ecx, %ecx
+ vpcmpeqb %ymm4, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $(VEC_SIZE * 3), %r8
+ jle L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+ vpcmpeqb %ymm5, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ vmovdqu %ymm4, (%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec5)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpeqb %ymm6, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec6)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpeqb %ymm7, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ lea VEC_SIZE(%rdi, %rcx), %rdi
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+L(StrncpyExit):
+ cmp $65, %r8d
+ je L(StrncpyExit65)
+ cmp $33, %r8d
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8d
+ jae L(StrncpyExit17_32)
+ cmp $9, %r8d
+ jae L(StrncpyExit9_16)
+ cmp $5, %r8d
+ jae L(StrncpyExit5_8)
+ cmp $3, %r8d
+ jae L(StrncpyExit3_4)
+ cmp $1, %r8d
+ ja L(StrncpyExit2)
+ je L(StrncpyExit1)
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi)
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(ExitZero):
+# ifndef USE_AS_STRCAT
+ mov %rdi, %rax
+# endif
+ VZEROUPPER
+ ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+#endif
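
A large share of the body above serves the bounded variants: when USE_AS_STRNCPY is defined and the source string ends before the length limit, L(StrncpyFillTailWithZero) pads the remainder of the destination with zero bytes, and the USE_AS_STPCPY variants additionally leave the address of the first NUL written in %rax.  For comparison, the reference semantics being implemented are (plain C, not the glibc code):

#include <stddef.h>

/* strncpy: copy at most N bytes and zero-fill whatever is left of the
   destination when SRC is shorter than N.  */
static char *
ref_strncpy (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dst[i] = src[i];
  for (; i < n; i++)
    dst[i] = '\0';
  return dst;
}

/* stpncpy: same copy, but return a pointer to the first NUL written,
   or DST + N when SRC is at least N bytes long.  */
static char *
ref_stpncpy (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dst[i] = src[i];
  char *end = dst + i;
  for (; i < n; i++)
    dst[i] = '\0';
  return end;
}
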
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index f125fd12f9..5e079edfca 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
# undef strcpy
# define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000000..bfefa659bb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index e95f62acac..b6cae40263 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
# undef strncat
# define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000000..9ef8c87627
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index daa150cee3..d6fd34671d 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
# undef strncpy
# define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());