2 Copyright (C) 2009-2013 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
21 /* We use 0x1a:
22 _SIDD_SBYTE_OPS
23 | _SIDD_CMP_EQUAL_EACH
24 | _SIDD_NEGATIVE_POLARITY
25 | _SIDD_LEAST_SIGNIFICANT
26 on pcmpistri to find out if two 16-byte data elements are the same
27 and the offset of the first different byte. There are 4 cases:
29 1. Both 16-byte data elements are valid and identical.
30 2. Both 16-byte data elements have EOS and are identical.
31 3. Both 16-byte data elements are valid and they differ at offset X.
32 4. At least one 16-byte data element has EOS at offset X. The two 16-byte
33 data elements must differ at or before offset X.
35 Here is the table of ECX, CFlag, ZFlag and SFlag for the 4 cases:
37 case    ECX      CFlag   ZFlag   SFlag
38  1      16        0       0       0
39  2      16        0       1       1
40  3       X        1       0       0
41  4    0 <= X      1      0/1     0/1
43 We exit from the loop for cases 2, 3 and 4 with jbe which branches
44 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
45 case 2. */
47 /* Put all SSE 4.2 functions together. */
48 .section .text.SECTION,"ax",@progbits
50 .type STRCMP_SSE42, @function
53 #ifdef USE_AS_STRCASECMP_L
/* __strcasecmp entry stub: loads the per-thread __libc_tsd_LOCALE value
   into RDX_LP and then runs off the end of this symbol into the code
   that follows it (see the FALLTHROUGH note after END).  */
54 ENTRY (GLABEL(__strcasecmp))
/* Fetch the TLS offset of __libc_tsd_LOCALE from its GOT slot
   (@gottpoff relocation, RIP-relative).  */
55 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* Read the variable through the %fs segment base into RDX_LP.
   NOTE(review): presumably this is the locale argument that the
   strcasecmp_l code expects in the third argument register -- confirm.  */
56 mov %fs:(%rax),%RDX_LP
58 // XXX 5 byte should be before the function
/* The five bytes below encode a single 5-byte NOP
   (nopl 0x0(%rax,%rax,1)); execution passes over it.  */
60 .byte 0x0f,0x1f,0x44,0x00,0x00
61 END (GLABEL(__strcasecmp))
62 /* FALLTHROUGH to strcasecmp_l. */
64 #ifdef USE_AS_STRNCASECMP_L
/* __strncasecmp entry stub: loads the per-thread __libc_tsd_LOCALE value
   into RCX_LP and then runs off the end of this symbol into the code
   that follows it (see the FALLTHROUGH note after END).  */
65 ENTRY (GLABEL(__strncasecmp))
/* Fetch the TLS offset of __libc_tsd_LOCALE from its GOT slot
   (@gottpoff relocation, RIP-relative).  */
66 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* Read the variable through the %fs segment base into RCX_LP.
   NOTE(review): presumably this is the locale argument that the
   strncasecmp_l code expects in the fourth argument register -- confirm.  */
67 mov %fs:(%rax),%RCX_LP
69 // XXX 5 byte should be before the function
/* The five bytes below encode a single 5-byte NOP
   (nopl 0x0(%rax,%rax,1)); execution passes over it.  */
71 .byte 0x0f,0x1f,0x44,0x00,0x00
72 END (GLABEL(__strncasecmp))
73 /* FALLTHROUGH to strncasecmp_l. */
78 # define movdqa vmovdqa
79 # define movdqu vmovdqu
80 # define pmovmskb vpmovmskb
81 # define pcmpistri vpcmpistri
83 # define pcmpeqb vpcmpeqb
84 # define psrldq vpsrldq
85 # define pslldq vpslldq
86 # define palignr vpalignr
88 # define D(arg) arg, arg
98 * This implementation uses SSE to compare up to 16 bytes at a time.
100 #ifdef USE_AS_STRCASECMP_L
101 /* We have to fall back on the C implementation for locales
102 with encodings not matching ASCII for single bytes. */
103 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
104 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
108 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
109 jne __strcasecmp_l_nonascii
111 #ifdef USE_AS_STRNCASECMP_L
112 /* We have to fall back on the C implementation for locales
113 with encodings not matching ASCII for single bytes. */
114 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
115 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
119 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
120 jne __strncasecmp_l_nonascii
123 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
125 je LABEL(strcmp_exitz)
132 /* Use 64bit AND here to avoid long NOP padding. */
133 and $0x3f, %rcx /* rsi alignment in cache line */
134 and $0x3f, %rax /* rdi alignment in cache line */
135 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
136 .section .rodata.cst16,"aM",@progbits,16
139 .quad 0x4040404040404040
140 .quad 0x4040404040404040
143 .quad 0x5a5a5a5a5a5a5a5a
144 .quad 0x5a5a5a5a5a5a5a5a
146 .quad 0x5b5b5b5b5b5b5b5b
147 .quad 0x5b5b5b5b5b5b5b5b
150 .quad 0x2020202020202020
151 .quad 0x2020202020202020
153 movdqa LABEL(belowupper)(%rip), %xmm4
154 # define UCLOW_reg %xmm4
155 movdqa LABEL(topupper)(%rip), %xmm5
156 # define UCHIGH_reg %xmm5
157 movdqa LABEL(touppermask)(%rip), %xmm6
158 # define LCQWORD_reg %xmm6
161 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
163 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
166 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2), three-operand (v-prefixed) form.
   For each of the two vector registers, byte by byte:
     - %xmm7 / %xmm9  = bytes that compare greater than UCLOW_reg
       (vpcmpgtb is a signed byte compare);
     - %xmm8 / %xmm10 = bytes that compare greater than UCHIGH_reg;
     - vpandn then keeps "greater than UCLOW and NOT greater than UCHIGH"
       in %xmm8 / %xmm10 (AT&T order: dst = ~src1 & src2);
     - that mask is ANDed with LCQWORD_reg and ORed into reg1 / reg2.
   UCLOW_reg, UCHIGH_reg and LCQWORD_reg are %xmm4, %xmm5 and %xmm6,
   loaded from LABEL(belowupper), LABEL(topupper) and LABEL(touppermask).
   NOTE(review): presumably those hold 0x40.., 0x5a.. and 0x20.. so that
   'A'..'Z' gain bit 0x20 -- the labels on the .quad data are not visible
   here, confirm.
   Overwrites %xmm7, %xmm8, %xmm9 and %xmm10 as scratch.  */
168 # define TOLOWER(reg1, reg2) \
169 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
170 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
171 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
172 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
173 vpandn %xmm7, %xmm8, %xmm8; \
174 vpandn %xmm9, %xmm10, %xmm10; \
175 vpand LCQWORD_reg, %xmm8, %xmm8; \
176 vpand LCQWORD_reg, %xmm10, %xmm10; \
177 vpor reg1, %xmm8, reg1; \
178 vpor reg2, %xmm10, reg2
/* TOLOWER (reg1, reg2), two-operand (legacy SSE) form.
   Works on copies so the destructive compares do not touch the inputs:
     - %xmm7 / %xmm9  start as copies of reg1 / reg2 and become the mask
       of bytes that compare greater than UCLOW_reg;
     - %xmm8 / %xmm10 start as copies of UCHIGH_reg and become the mask
       of bytes where UCHIGH_reg compares greater than reg1 / reg2
       (pcmpgtb is a signed byte compare, dst = dst > src);
     - the masks are then ANDed with LCQWORD_reg.
   Overwrites %xmm7, %xmm8, %xmm9 and %xmm10 as scratch.
   NOTE(review): as the macro stands here, %xmm8 is never ANDed into
   %xmm7 (only the %xmm10 -> %xmm9 pair is combined), nothing is ORed
   back into reg1 / reg2, and the last line still ends in a continuation
   backslash, which splices the following source line into the macro.
   Lines appear to be missing from this definition -- compare with the
   three-operand TOLOWER variant and restore them before relying on it.  */
180 # define TOLOWER(reg1, reg2) \
181 movdqa reg1, %xmm7; \
182 movdqa UCHIGH_reg, %xmm8; \
183 movdqa reg2, %xmm9; \
184 movdqa UCHIGH_reg, %xmm10; \
185 pcmpgtb UCLOW_reg, %xmm7; \
186 pcmpgtb reg1, %xmm8; \
187 pcmpgtb UCLOW_reg, %xmm9; \
188 pcmpgtb reg2, %xmm10; \
190 pand %xmm10, %xmm9; \
191 pand LCQWORD_reg, %xmm7; \
192 pand LCQWORD_reg, %xmm9; \
196 TOLOWER (%xmm1, %xmm2)
198 # define TOLOWER(reg1, reg2)
200 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
201 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
202 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
203 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
205 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
206 jnz LABEL(less16bytes)/* If not, find different value or null char */
207 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
209 jbe LABEL(strcmp_exitz)/* finish comparision */
211 add $16, %rsi /* prepare to search next 16 bytes */
212 add $16, %rdi /* prepare to search next 16 bytes */
215 * Determine source and destination string offsets from 16-byte
216 * alignment. Use relative offset difference between the two to
217 * determine which case below to use.
221 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
222 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
223 mov $0xffff, %edx /* for equivalent offset */
225 and $0xf, %ecx /* offset of rsi */
226 and $0xf, %eax /* offset of rdi */
227 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
229 je LABEL(ashr_0) /* rsi and rdi relative offset same */
231 mov %edx, %r8d /* r8d is offset flag for exit tail */
239 lea LABEL(unaligned_table)(%rip), %r10
240 movslq (%r10, %r9,4), %r9
241 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
242 lea (%r10, %r9), %r10
243 jmp *%r10 /* jump to corresponding case */
246 * The following cases will be handled by ashr_0
247 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
248 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
254 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
255 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
256 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
259 TOLOWER (%xmm1, %xmm2)
260 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
262 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
264 shr %cl, %edx /* adjust 0xffff for offset */
265 shr %cl, %r9d /* adjust for 16-byte offset */
268 * edx must equal r9d if the leftover (16-rcx) bytes are equal to the
269 * bytes starting at (16-rax) and no null char was seen.
271 jne LABEL(less32bytes) /* mismatch or null char */
272 UPDATE_STRNCMP_COUNTER
277 * Now both strings are aligned at 16-byte boundary. Loop over strings
278 * checking 32-bytes per iteration.
280 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
283 movdqa (%rdi,%rdx), %xmm0
284 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
285 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
287 movdqa (%rsi,%rdx), %xmm1
288 TOLOWER (%xmm0, %xmm1)
289 pcmpistri $0x1a, %xmm1, %xmm0
292 jbe LABEL(ashr_0_exit_use)
293 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
295 jbe LABEL(strcmp_exitz)
298 movdqa (%rdi,%rdx), %xmm0
299 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
300 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
302 movdqa (%rsi,%rdx), %xmm1
303 TOLOWER (%xmm0, %xmm1)
304 pcmpistri $0x1a, %xmm1, %xmm0
307 jbe LABEL(ashr_0_exit_use)
308 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
310 jbe LABEL(strcmp_exitz)
312 jmp LABEL(ashr_0_use)
316 LABEL(ashr_0_exit_use):
317 jnc LABEL(strcmp_exitz)
318 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
320 jbe LABEL(strcmp_exitz)
322 lea -16(%rdx, %rcx), %rcx
323 movzbl (%rdi, %rcx), %eax
324 movzbl (%rsi, %rcx), %edx
325 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
326 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
327 movl (%rcx,%rax,4), %eax
328 movl (%rcx,%rdx,4), %edx
336 * The following cases will be handled by ashr_1
337 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
338 * n(15) n -15 0(15 +(n-15) - n) ashr_1
342 pslldq $15, D(%xmm2) /* shift first string to align with second */
343 TOLOWER (%xmm1, %xmm2)
344 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
345 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
347 shr %cl, %edx /* adjust 0xffff for offset */
348 shr %cl, %r9d /* adjust for 16-byte offset */
350 jnz LABEL(less32bytes) /* mismatch or null char seen */
352 UPDATE_STRNCMP_COUNTER
354 mov $16, %rcx /* index for loads*/
355 mov $1, %r9d /* byte position left over from less32bytes case */
357 * Set up %r10 so that we can detect crossing a page boundary.
358 * When %r10 goes positive we have crossed a page boundary and
359 * must take the nibble_ashr_1_use path.
362 and $0xfff, %r10 /* offset into 4K page */
363 sub $0x1000, %r10 /* subtract 4K pagesize */
364 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
367 LABEL(loop_ashr_1_use):
369 jg LABEL(nibble_ashr_1_use)
371 LABEL(nibble_ashr_1_restart_use):
372 movdqa (%rdi, %rdx), %xmm0
373 palignr $1, -16(%rdi, %rdx), D(%xmm0)
374 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
375 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
377 movdqa (%rsi,%rdx), %xmm1
378 TOLOWER (%xmm0, %xmm1)
379 pcmpistri $0x1a, %xmm1, %xmm0
382 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
384 jbe LABEL(strcmp_exitz)
389 jg LABEL(nibble_ashr_1_use)
391 movdqa (%rdi, %rdx), %xmm0
392 palignr $1, -16(%rdi, %rdx), D(%xmm0)
393 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
394 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
396 movdqa (%rsi,%rdx), %xmm1
397 TOLOWER (%xmm0, %xmm1)
398 pcmpistri $0x1a, %xmm1, %xmm0
401 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
403 jbe LABEL(strcmp_exitz)
406 jmp LABEL(loop_ashr_1_use)
409 LABEL(nibble_ashr_1_use):
411 movdqa -16(%rdi, %rdx), %xmm0
413 pcmpistri $0x3a,%xmm0, %xmm0
414 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
416 jae LABEL(nibble_ashr_exit_use)
419 ja LABEL(nibble_ashr_1_restart_use)
421 jmp LABEL(nibble_ashr_exit_use)
424 * The following cases will be handled by ashr_2
425 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
426 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
431 TOLOWER (%xmm1, %xmm2)
432 pcmpeqb %xmm1, D(%xmm2)
433 psubb %xmm0, D(%xmm2)
438 jnz LABEL(less32bytes)
440 UPDATE_STRNCMP_COUNTER
442 mov $16, %rcx /* index for loads */
443 mov $2, %r9d /* byte position left over from less32bytes case */
445 * Set up %r10 so that we can detect crossing a page boundary.
446 * When %r10 goes positive we have crossed a page boundary and
447 * must take the nibble_ashr_2_use path.
450 and $0xfff, %r10 /* offset into 4K page */
451 sub $0x1000, %r10 /* subtract 4K pagesize */
452 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
455 LABEL(loop_ashr_2_use):
457 jg LABEL(nibble_ashr_2_use)
459 LABEL(nibble_ashr_2_restart_use):
460 movdqa (%rdi, %rdx), %xmm0
461 palignr $2, -16(%rdi, %rdx), D(%xmm0)
462 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
463 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
465 movdqa (%rsi,%rdx), %xmm1
466 TOLOWER (%xmm0, %xmm1)
467 pcmpistri $0x1a, %xmm1, %xmm0
470 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
472 jbe LABEL(strcmp_exitz)
477 jg LABEL(nibble_ashr_2_use)
479 movdqa (%rdi, %rdx), %xmm0
480 palignr $2, -16(%rdi, %rdx), D(%xmm0)
481 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
482 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
484 movdqa (%rsi,%rdx), %xmm1
485 TOLOWER (%xmm0, %xmm1)
486 pcmpistri $0x1a, %xmm1, %xmm0
489 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
491 jbe LABEL(strcmp_exitz)
494 jmp LABEL(loop_ashr_2_use)
497 LABEL(nibble_ashr_2_use):
499 movdqa -16(%rdi, %rdx), %xmm0
501 pcmpistri $0x3a,%xmm0, %xmm0
502 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
504 jae LABEL(nibble_ashr_exit_use)
507 ja LABEL(nibble_ashr_2_restart_use)
509 jmp LABEL(nibble_ashr_exit_use)
512 * The following cases will be handled by ashr_3
513 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
514 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
519 TOLOWER (%xmm1, %xmm2)
520 pcmpeqb %xmm1, D(%xmm2)
521 psubb %xmm0, D(%xmm2)
526 jnz LABEL(less32bytes)
529 UPDATE_STRNCMP_COUNTER
531 mov $16, %rcx /* index for loads */
532 mov $3, %r9d /* byte position left over from less32bytes case */
534 * Set up %r10 so that we can detect crossing a page boundary.
535 * When %r10 goes positive we have crossed a page boundary and
536 * must take the nibble_ashr_3_use path.
539 and $0xfff, %r10 /* offset into 4K page */
540 sub $0x1000, %r10 /* subtract 4K pagesize */
541 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
543 LABEL(loop_ashr_3_use):
545 jg LABEL(nibble_ashr_3_use)
547 LABEL(nibble_ashr_3_restart_use):
548 movdqa (%rdi, %rdx), %xmm0
549 palignr $3, -16(%rdi, %rdx), D(%xmm0)
550 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
551 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
553 movdqa (%rsi,%rdx), %xmm1
554 TOLOWER (%xmm0, %xmm1)
555 pcmpistri $0x1a, %xmm1, %xmm0
558 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
560 jbe LABEL(strcmp_exitz)
565 jg LABEL(nibble_ashr_3_use)
567 movdqa (%rdi, %rdx), %xmm0
568 palignr $3, -16(%rdi, %rdx), D(%xmm0)
569 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
570 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
572 movdqa (%rsi,%rdx), %xmm1
573 TOLOWER (%xmm0, %xmm1)
574 pcmpistri $0x1a, %xmm1, %xmm0
577 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
579 jbe LABEL(strcmp_exitz)
582 jmp LABEL(loop_ashr_3_use)
585 LABEL(nibble_ashr_3_use):
587 movdqa -16(%rdi, %rdx), %xmm0
589 pcmpistri $0x3a,%xmm0, %xmm0
590 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
592 jae LABEL(nibble_ashr_exit_use)
595 ja LABEL(nibble_ashr_3_restart_use)
597 jmp LABEL(nibble_ashr_exit_use)
600 * The following cases will be handled by ashr_4
601 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
602 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
607 TOLOWER (%xmm1, %xmm2)
608 pcmpeqb %xmm1, D(%xmm2)
609 psubb %xmm0, D(%xmm2)
614 jnz LABEL(less32bytes)
617 UPDATE_STRNCMP_COUNTER
619 mov $16, %rcx /* index for loads */
620 mov $4, %r9d /* byte position left over from less32bytes case */
622 * Set up %r10 so that we can detect crossing a page boundary.
623 * When %r10 goes positive we have crossed a page boundary and
624 * must take the nibble_ashr_4_use path.
627 and $0xfff, %r10 /* offset into 4K page */
628 sub $0x1000, %r10 /* subtract 4K pagesize */
629 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
632 LABEL(loop_ashr_4_use):
634 jg LABEL(nibble_ashr_4_use)
636 LABEL(nibble_ashr_4_restart_use):
637 movdqa (%rdi, %rdx), %xmm0
638 palignr $4, -16(%rdi, %rdx), D(%xmm0)
639 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
640 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
642 movdqa (%rsi,%rdx), %xmm1
643 TOLOWER (%xmm0, %xmm1)
644 pcmpistri $0x1a, %xmm1, %xmm0
647 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
649 jbe LABEL(strcmp_exitz)
654 jg LABEL(nibble_ashr_4_use)
656 movdqa (%rdi, %rdx), %xmm0
657 palignr $4, -16(%rdi, %rdx), D(%xmm0)
658 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
659 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
661 movdqa (%rsi,%rdx), %xmm1
662 TOLOWER (%xmm0, %xmm1)
663 pcmpistri $0x1a, %xmm1, %xmm0
666 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
668 jbe LABEL(strcmp_exitz)
671 jmp LABEL(loop_ashr_4_use)
674 LABEL(nibble_ashr_4_use):
676 movdqa -16(%rdi, %rdx), %xmm0
678 pcmpistri $0x3a,%xmm0, %xmm0
679 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
681 jae LABEL(nibble_ashr_exit_use)
684 ja LABEL(nibble_ashr_4_restart_use)
686 jmp LABEL(nibble_ashr_exit_use)
689 * The following cases will be handled by ashr_5
690 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
691 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
696 TOLOWER (%xmm1, %xmm2)
697 pcmpeqb %xmm1, D(%xmm2)
698 psubb %xmm0, D(%xmm2)
703 jnz LABEL(less32bytes)
706 UPDATE_STRNCMP_COUNTER
708 mov $16, %rcx /* index for loads */
709 mov $5, %r9d /* byte position left over from less32bytes case */
711 * Set up %r10 so that we can detect crossing a page boundary.
712 * When %r10 goes positive we have crossed a page boundary and
713 * must take the nibble_ashr_5_use path.
716 and $0xfff, %r10 /* offset into 4K page */
717 sub $0x1000, %r10 /* subtract 4K pagesize */
718 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
721 LABEL(loop_ashr_5_use):
723 jg LABEL(nibble_ashr_5_use)
725 LABEL(nibble_ashr_5_restart_use):
726 movdqa (%rdi, %rdx), %xmm0
727 palignr $5, -16(%rdi, %rdx), D(%xmm0)
728 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
729 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
731 movdqa (%rsi,%rdx), %xmm1
732 TOLOWER (%xmm0, %xmm1)
733 pcmpistri $0x1a, %xmm1, %xmm0
736 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
738 jbe LABEL(strcmp_exitz)
743 jg LABEL(nibble_ashr_5_use)
745 movdqa (%rdi, %rdx), %xmm0
747 palignr $5, -16(%rdi, %rdx), D(%xmm0)
748 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
749 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
751 movdqa (%rsi,%rdx), %xmm1
752 TOLOWER (%xmm0, %xmm1)
753 pcmpistri $0x1a, %xmm1, %xmm0
756 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
758 jbe LABEL(strcmp_exitz)
761 jmp LABEL(loop_ashr_5_use)
764 LABEL(nibble_ashr_5_use):
766 movdqa -16(%rdi, %rdx), %xmm0
768 pcmpistri $0x3a,%xmm0, %xmm0
769 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
771 jae LABEL(nibble_ashr_exit_use)
774 ja LABEL(nibble_ashr_5_restart_use)
776 jmp LABEL(nibble_ashr_exit_use)
779 * The following cases will be handled by ashr_6
780 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
781 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
786 TOLOWER (%xmm1, %xmm2)
787 pcmpeqb %xmm1, D(%xmm2)
788 psubb %xmm0, D(%xmm2)
793 jnz LABEL(less32bytes)
796 UPDATE_STRNCMP_COUNTER
798 mov $16, %rcx /* index for loads */
799 mov $6, %r9d /* byte position left over from less32bytes case */
801 * Set up %r10 so that we can detect crossing a page boundary.
802 * When %r10 goes positive we have crossed a page boundary and
803 * must take the nibble_ashr_6_use path.
806 and $0xfff, %r10 /* offset into 4K page */
807 sub $0x1000, %r10 /* subtract 4K pagesize */
808 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
811 LABEL(loop_ashr_6_use):
813 jg LABEL(nibble_ashr_6_use)
815 LABEL(nibble_ashr_6_restart_use):
816 movdqa (%rdi, %rdx), %xmm0
817 palignr $6, -16(%rdi, %rdx), D(%xmm0)
818 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
819 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
821 movdqa (%rsi,%rdx), %xmm1
822 TOLOWER (%xmm0, %xmm1)
823 pcmpistri $0x1a, %xmm1, %xmm0
826 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
828 jbe LABEL(strcmp_exitz)
833 jg LABEL(nibble_ashr_6_use)
835 movdqa (%rdi, %rdx), %xmm0
836 palignr $6, -16(%rdi, %rdx), D(%xmm0)
837 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
838 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
840 movdqa (%rsi,%rdx), %xmm1
841 TOLOWER (%xmm0, %xmm1)
842 pcmpistri $0x1a, %xmm1, %xmm0
845 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
847 jbe LABEL(strcmp_exitz)
850 jmp LABEL(loop_ashr_6_use)
853 LABEL(nibble_ashr_6_use):
855 movdqa -16(%rdi, %rdx), %xmm0
857 pcmpistri $0x3a,%xmm0, %xmm0
858 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
860 jae LABEL(nibble_ashr_exit_use)
863 ja LABEL(nibble_ashr_6_restart_use)
865 jmp LABEL(nibble_ashr_exit_use)
868 * The following cases will be handled by ashr_7
869 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
870 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
875 TOLOWER (%xmm1, %xmm2)
876 pcmpeqb %xmm1, D(%xmm2)
877 psubb %xmm0, D(%xmm2)
882 jnz LABEL(less32bytes)
885 UPDATE_STRNCMP_COUNTER
887 mov $16, %rcx /* index for loads */
888 mov $7, %r9d /* byte position left over from less32bytes case */
890 * Set up %r10 so that we can detect crossing a page boundary.
891 * When %r10 goes positive we have crossed a page boundary and
892 * must take the nibble_ashr_7_use path.
895 and $0xfff, %r10 /* offset into 4K page */
896 sub $0x1000, %r10 /* subtract 4K pagesize */
897 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
900 LABEL(loop_ashr_7_use):
902 jg LABEL(nibble_ashr_7_use)
904 LABEL(nibble_ashr_7_restart_use):
905 movdqa (%rdi, %rdx), %xmm0
906 palignr $7, -16(%rdi, %rdx), D(%xmm0)
907 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
908 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
910 movdqa (%rsi,%rdx), %xmm1
911 TOLOWER (%xmm0, %xmm1)
912 pcmpistri $0x1a, %xmm1, %xmm0
915 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
917 jbe LABEL(strcmp_exitz)
922 jg LABEL(nibble_ashr_7_use)
924 movdqa (%rdi, %rdx), %xmm0
925 palignr $7, -16(%rdi, %rdx), D(%xmm0)
926 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
927 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
929 movdqa (%rsi,%rdx), %xmm1
930 TOLOWER (%xmm0, %xmm1)
931 pcmpistri $0x1a, %xmm1, %xmm0
934 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
936 jbe LABEL(strcmp_exitz)
939 jmp LABEL(loop_ashr_7_use)
942 LABEL(nibble_ashr_7_use):
944 movdqa -16(%rdi, %rdx), %xmm0
946 pcmpistri $0x3a,%xmm0, %xmm0
947 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
949 jae LABEL(nibble_ashr_exit_use)
952 ja LABEL(nibble_ashr_7_restart_use)
954 jmp LABEL(nibble_ashr_exit_use)
957 * The following cases will be handled by ashr_8
958 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
959 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
964 TOLOWER (%xmm1, %xmm2)
965 pcmpeqb %xmm1, D(%xmm2)
966 psubb %xmm0, D(%xmm2)
971 jnz LABEL(less32bytes)
974 UPDATE_STRNCMP_COUNTER
976 mov $16, %rcx /* index for loads */
977 mov $8, %r9d /* byte position left over from less32bytes case */
979 * Set up %r10 so that we can detect crossing a page boundary.
980 * When %r10 goes positive we have crossed a page boundary and
981 * must take the nibble_ashr_8_use path.
984 and $0xfff, %r10 /* offset into 4K page */
985 sub $0x1000, %r10 /* subtract 4K pagesize */
986 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
989 LABEL(loop_ashr_8_use):
991 jg LABEL(nibble_ashr_8_use)
993 LABEL(nibble_ashr_8_restart_use):
994 movdqa (%rdi, %rdx), %xmm0
995 palignr $8, -16(%rdi, %rdx), D(%xmm0)
996 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
997 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
999 movdqa (%rsi,%rdx), %xmm1
1000 TOLOWER (%xmm0, %xmm1)
1001 pcmpistri $0x1a, %xmm1, %xmm0
1004 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1006 jbe LABEL(strcmp_exitz)
1011 jg LABEL(nibble_ashr_8_use)
1013 movdqa (%rdi, %rdx), %xmm0
1014 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1015 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1016 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1018 movdqa (%rsi,%rdx), %xmm1
1019 TOLOWER (%xmm0, %xmm1)
1020 pcmpistri $0x1a, %xmm1, %xmm0
1023 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1025 jbe LABEL(strcmp_exitz)
1028 jmp LABEL(loop_ashr_8_use)
1031 LABEL(nibble_ashr_8_use):
1033 movdqa -16(%rdi, %rdx), %xmm0
1035 pcmpistri $0x3a,%xmm0, %xmm0
1036 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1038 jae LABEL(nibble_ashr_exit_use)
1041 ja LABEL(nibble_ashr_8_restart_use)
1043 jmp LABEL(nibble_ashr_exit_use)
1046 * The following cases will be handled by ashr_9
1047 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1048 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1053 TOLOWER (%xmm1, %xmm2)
1054 pcmpeqb %xmm1, D(%xmm2)
1055 psubb %xmm0, D(%xmm2)
1056 pmovmskb %xmm2, %r9d
1060 jnz LABEL(less32bytes)
1061 movdqa (%rdi), %xmm3
1063 UPDATE_STRNCMP_COUNTER
1065 mov $16, %rcx /* index for loads */
1066 mov $9, %r9d /* byte position left over from less32bytes case */
1068 * Set up %r10 so that we can detect crossing a page boundary.
1069 * When %r10 goes positive we have crossed a page boundary and
1070 * must take the nibble_ashr_9_use path.
1073 and $0xfff, %r10 /* offset into 4K page */
1074 sub $0x1000, %r10 /* subtract 4K pagesize */
1075 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1078 LABEL(loop_ashr_9_use):
1080 jg LABEL(nibble_ashr_9_use)
1082 LABEL(nibble_ashr_9_restart_use):
1083 movdqa (%rdi, %rdx), %xmm0
1085 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1086 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1087 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1089 movdqa (%rsi,%rdx), %xmm1
1090 TOLOWER (%xmm0, %xmm1)
1091 pcmpistri $0x1a, %xmm1, %xmm0
1094 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1096 jbe LABEL(strcmp_exitz)
1101 jg LABEL(nibble_ashr_9_use)
1103 movdqa (%rdi, %rdx), %xmm0
1104 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1105 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1106 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1108 movdqa (%rsi,%rdx), %xmm1
1109 TOLOWER (%xmm0, %xmm1)
1110 pcmpistri $0x1a, %xmm1, %xmm0
1113 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1115 jbe LABEL(strcmp_exitz)
1118 jmp LABEL(loop_ashr_9_use)
1121 LABEL(nibble_ashr_9_use):
1123 movdqa -16(%rdi, %rdx), %xmm0
1125 pcmpistri $0x3a,%xmm0, %xmm0
1126 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1128 jae LABEL(nibble_ashr_exit_use)
1131 ja LABEL(nibble_ashr_9_restart_use)
1133 jmp LABEL(nibble_ashr_exit_use)
1136 * The following cases will be handled by ashr_10
1137 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1138 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1143 TOLOWER (%xmm1, %xmm2)
1144 pcmpeqb %xmm1, D(%xmm2)
1145 psubb %xmm0, D(%xmm2)
1146 pmovmskb %xmm2, %r9d
1150 jnz LABEL(less32bytes)
1151 movdqa (%rdi), %xmm3
1153 UPDATE_STRNCMP_COUNTER
1155 mov $16, %rcx /* index for loads */
1156 mov $10, %r9d /* byte position left over from less32bytes case */
1158 * Set up %r10 so that we can detect crossing a page boundary.
1159 * When %r10 goes positive we have crossed a page boundary and
1160 * must take the nibble_ashr_10_use path.
1163 and $0xfff, %r10 /* offset into 4K page */
1164 sub $0x1000, %r10 /* subtract 4K pagesize */
1165 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1168 LABEL(loop_ashr_10_use):
1170 jg LABEL(nibble_ashr_10_use)
1172 LABEL(nibble_ashr_10_restart_use):
1173 movdqa (%rdi, %rdx), %xmm0
1174 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1175 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1176 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1178 movdqa (%rsi,%rdx), %xmm1
1179 TOLOWER (%xmm0, %xmm1)
1180 pcmpistri $0x1a, %xmm1, %xmm0
1183 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1185 jbe LABEL(strcmp_exitz)
1190 jg LABEL(nibble_ashr_10_use)
1192 movdqa (%rdi, %rdx), %xmm0
1193 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1194 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1195 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1197 movdqa (%rsi,%rdx), %xmm1
1198 TOLOWER (%xmm0, %xmm1)
1199 pcmpistri $0x1a, %xmm1, %xmm0
1202 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1204 jbe LABEL(strcmp_exitz)
1207 jmp LABEL(loop_ashr_10_use)
1210 LABEL(nibble_ashr_10_use):
1212 movdqa -16(%rdi, %rdx), %xmm0
1213 psrldq $10, D(%xmm0)
1214 pcmpistri $0x3a,%xmm0, %xmm0
1215 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1217 jae LABEL(nibble_ashr_exit_use)
1220 ja LABEL(nibble_ashr_10_restart_use)
1222 jmp LABEL(nibble_ashr_exit_use)
1225 * The following cases will be handled by ashr_11
1226 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1227 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1232 TOLOWER (%xmm1, %xmm2)
1233 pcmpeqb %xmm1, D(%xmm2)
1234 psubb %xmm0, D(%xmm2)
1235 pmovmskb %xmm2, %r9d
1239 jnz LABEL(less32bytes)
1240 movdqa (%rdi), %xmm3
1242 UPDATE_STRNCMP_COUNTER
1244 mov $16, %rcx /* index for loads */
1245 mov $11, %r9d /* byte position left over from less32bytes case */
1247 * Set up %r10 so that we can detect crossing a page boundary.
1248 * When %r10 goes positive we have crossed a page boundary and
1249 * must take the nibble_ashr_11_use path.
1252 and $0xfff, %r10 /* offset into 4K page */
1253 sub $0x1000, %r10 /* subtract 4K pagesize */
1254 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1257 LABEL(loop_ashr_11_use):
1259 jg LABEL(nibble_ashr_11_use)
1261 LABEL(nibble_ashr_11_restart_use):
1262 movdqa (%rdi, %rdx), %xmm0
1263 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1264 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1265 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1267 movdqa (%rsi,%rdx), %xmm1
1268 TOLOWER (%xmm0, %xmm1)
1269 pcmpistri $0x1a, %xmm1, %xmm0
1272 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1274 jbe LABEL(strcmp_exitz)
1279 jg LABEL(nibble_ashr_11_use)
1281 movdqa (%rdi, %rdx), %xmm0
1282 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1283 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1284 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1286 movdqa (%rsi,%rdx), %xmm1
1287 TOLOWER (%xmm0, %xmm1)
1288 pcmpistri $0x1a, %xmm1, %xmm0
1291 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1293 jbe LABEL(strcmp_exitz)
1296 jmp LABEL(loop_ashr_11_use)
1299 LABEL(nibble_ashr_11_use):
1301 movdqa -16(%rdi, %rdx), %xmm0
1302 psrldq $11, D(%xmm0)
1303 pcmpistri $0x3a,%xmm0, %xmm0
1304 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1306 jae LABEL(nibble_ashr_exit_use)
1309 ja LABEL(nibble_ashr_11_restart_use)
1311 jmp LABEL(nibble_ashr_exit_use)
1314 * The following cases will be handled by ashr_12
1315 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1316 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1321 TOLOWER (%xmm1, %xmm2)
1322 pcmpeqb %xmm1, D(%xmm2)
1323 psubb %xmm0, D(%xmm2)
1324 pmovmskb %xmm2, %r9d
1328 jnz LABEL(less32bytes)
1329 movdqa (%rdi), %xmm3
1331 UPDATE_STRNCMP_COUNTER
1333 mov $16, %rcx /* index for loads */
1334 mov $12, %r9d /* byte position left over from less32bytes case */
1336 * Set up %r10 so that we can detect crossing a page boundary.
1337 * When %r10 goes positive we have crossed a page boundary and
1338 * must take the nibble_ashr_12_use path.
1341 and $0xfff, %r10 /* offset into 4K page */
1342 sub $0x1000, %r10 /* subtract 4K pagesize */
1343 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* loop_ashr_12_use: main compare loop for the 12-byte shift case.  Each
   pass combines two aligned 16-byte loads from %rdi with palignr $12 and
   compares the result with 16 bytes at (%rsi,%rdx).  The body appears twice
   (unrolled).  The index updates, the exits taken right after pcmpistri and
   the #else/#endif lines are absent from this listing (see the gaps in the
   embedded line numbers).  */
1346 LABEL(loop_ashr_12_use):
/* Signed-greater branch to the nibble path.  The instruction that sets the
   flags is not shown; per the entry-code comment the nibble path is taken
   once %r10 goes positive.  */
1348 jg LABEL(nibble_ashr_12_use)
/* Re-entry point used by nibble_ashr_12_use.  */
1350 LABEL(nibble_ashr_12_restart_use):
/* %xmm0 = upper 4 bytes of the previous aligned block followed by the
   lower 12 bytes of the current aligned block of the %rdi string.  */
1351 movdqa (%rdi, %rdx), %xmm0
1352 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1353 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variants: compare directly against memory.  */
1354 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variants: load the %rsi bytes, apply TOLOWER (macro
   defined outside this view) to both operands, then compare registers.  */
1356 movdqa (%rsi,%rdx), %xmm1
1357 TOLOWER (%xmm0, %xmm1)
1358 pcmpistri $0x1a, %xmm1, %xmm0
1361 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1363 jbe LABEL(strcmp_exitz)
/* Second copy of the loop body.  */
1368 jg LABEL(nibble_ashr_12_use)
1370 movdqa (%rdi, %rdx), %xmm0
1371 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1372 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1373 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1375 movdqa (%rsi,%rdx), %xmm1
1376 TOLOWER (%xmm0, %xmm1)
1377 pcmpistri $0x1a, %xmm1, %xmm0
1380 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1382 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of blocks.  */
1385 jmp LABEL(loop_ashr_12_use)
/* nibble_ashr_12_use: side path taken from loop_ashr_12_use via jg; per
   the entry-code comment this is the page-boundary ("nibble") case.
   Several original lines (see the gaps in the embedded line numbers) are
   absent from this listing.  */
1388 LABEL(nibble_ashr_12_use):
/* Aligned reload of the 16 bytes just below index %rdx of the %rdi string,
   then a byte shift right by 12: the upper 4 bytes move to the bottom of
   %xmm0 and the remaining lanes are zero-filled.  */
1390 movdqa -16(%rdi, %rdx), %xmm0
1391 psrldq $12, D(%xmm0)
/* Compare %xmm0 with itself.  imm8 0x3a is the 0x1a mode used by the main
   loop with bit 5 added (masked negative polarity).
   NOTE(review): per the PCMPISTRI definition this should leave in %ecx the
   number of bytes that precede the first NUL in %xmm0 -- confirm.  */
1392 pcmpistri $0x3a,%xmm0, %xmm0
1393 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants only: leave through the common exit.  The
   compare that feeds this jae is not present in this listing.  */
1395 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop at its restart label; the compare that feeds this
   ja is likewise not shown.  Otherwise take the common exit.  */
1398 ja LABEL(nibble_ashr_12_restart_use)
1400 jmp LABEL(nibble_ashr_exit_use)
1403 * The following cases will be handled by ashr_13:
1404 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset        | corresponding case
1405 *   n (3~15)            | n - 3               | 12 (15 + (n - 3) - n)  | ashr_13
/* ashr_13 entry code.  The label line and some instructions (see the gaps
   in the embedded line numbers) are absent from this listing.  */
/* TOLOWER is a macro defined outside this view; by its name it case-folds
   its operands for the strcasecmp variants -- confirm.  */
1410 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = byte-wise equality mask of %xmm1 and %xmm2, then minus %xmm0.
   NOTE(review): %xmm0 presumably holds a NUL-detection mask at this point;
   it is set up outside this view.  */
1411 pcmpeqb %xmm1, D(%xmm2)
1412 psubb %xmm0, D(%xmm2)
/* Gather the top bit of each byte of %xmm2 into %r9d.  */
1413 pmovmskb %xmm2, %r9d
/* The instruction that sets the flags for this jnz is not shown.  */
1417 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1418 movdqa (%rdi), %xmm3
/* Macro defined outside this view.  */
1420 UPDATE_STRNCMP_COUNTER
1422 mov $16, %rcx /* index for loads */
1423 mov $13, %r9d /* byte position left over from less32bytes case */
1425 * Setup %r10 value allows us to detect crossing a page boundary.
1426 * When %r10 goes positive we have crossed a page boundary and
1427 * need to do a nibble.
/* The instruction that first loads %r10 is absent from this listing.  The
   two visible steps leave %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative
   value in the range [-0x1000, -1].  */
1430 and $0xfff, %r10 /* offset into 4K page */
1431 sub $0x1000, %r10 /* subtract 4K pagesize */
1433 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* loop_ashr_13_use: main compare loop for the 13-byte shift case.  Each
   pass combines two aligned 16-byte loads from %rdi with palignr $13 and
   compares the result with 16 bytes at (%rsi,%rdx).  The body appears twice
   (unrolled).  The index updates, the exits taken right after pcmpistri and
   the #else/#endif lines are absent from this listing (see the gaps in the
   embedded line numbers).  */
1436 LABEL(loop_ashr_13_use):
/* Signed-greater branch to the nibble path.  The instruction that sets the
   flags is not shown; per the entry-code comment the nibble path is taken
   once %r10 goes positive.  */
1438 jg LABEL(nibble_ashr_13_use)
/* Re-entry point used by nibble_ashr_13_use.  */
1440 LABEL(nibble_ashr_13_restart_use):
/* %xmm0 = upper 3 bytes of the previous aligned block followed by the
   lower 13 bytes of the current aligned block of the %rdi string.  */
1441 movdqa (%rdi, %rdx), %xmm0
1442 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1443 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variants: compare directly against memory.  */
1444 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variants: load the %rsi bytes, apply TOLOWER (macro
   defined outside this view) to both operands, then compare registers.  */
1446 movdqa (%rsi,%rdx), %xmm1
1447 TOLOWER (%xmm0, %xmm1)
1448 pcmpistri $0x1a, %xmm1, %xmm0
1451 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1453 jbe LABEL(strcmp_exitz)
/* Second copy of the loop body.  */
1458 jg LABEL(nibble_ashr_13_use)
1460 movdqa (%rdi, %rdx), %xmm0
1461 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1462 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1463 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1465 movdqa (%rsi,%rdx), %xmm1
1466 TOLOWER (%xmm0, %xmm1)
1467 pcmpistri $0x1a, %xmm1, %xmm0
1470 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1472 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of blocks.  */
1475 jmp LABEL(loop_ashr_13_use)
/* nibble_ashr_13_use: side path taken from loop_ashr_13_use via jg; per
   the entry-code comment this is the page-boundary ("nibble") case.
   Several original lines (see the gaps in the embedded line numbers) are
   absent from this listing.  */
1478 LABEL(nibble_ashr_13_use):
/* Aligned reload of the 16 bytes just below index %rdx of the %rdi string,
   then a byte shift right by 13: the upper 3 bytes move to the bottom of
   %xmm0 and the remaining lanes are zero-filled.  */
1480 movdqa -16(%rdi, %rdx), %xmm0
1481 psrldq $13, D(%xmm0)
/* Compare %xmm0 with itself.  imm8 0x3a is the 0x1a mode used by the main
   loop with bit 5 added (masked negative polarity).
   NOTE(review): per the PCMPISTRI definition this should leave in %ecx the
   number of bytes that precede the first NUL in %xmm0 -- confirm.  */
1482 pcmpistri $0x3a,%xmm0, %xmm0
1483 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants only: leave through the common exit.  The
   compare that feeds this jae is not present in this listing.  */
1485 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop at its restart label; the compare that feeds this
   ja is likewise not shown.  Otherwise take the common exit.  */
1488 ja LABEL(nibble_ashr_13_restart_use)
1490 jmp LABEL(nibble_ashr_exit_use)
1493 * The following cases will be handled by ashr_14:
1494 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset        | corresponding case
1495 *   n (2~15)            | n - 2               | 13 (15 + (n - 2) - n)  | ashr_14
/* ashr_14 entry code.  The label line and some instructions (see the gaps
   in the embedded line numbers) are absent from this listing.  */
/* TOLOWER is a macro defined outside this view; by its name it case-folds
   its operands for the strcasecmp variants -- confirm.  */
1500 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = byte-wise equality mask of %xmm1 and %xmm2, then minus %xmm0.
   NOTE(review): %xmm0 presumably holds a NUL-detection mask at this point;
   it is set up outside this view.  */
1501 pcmpeqb %xmm1, D(%xmm2)
1502 psubb %xmm0, D(%xmm2)
/* Gather the top bit of each byte of %xmm2 into %r9d.  */
1503 pmovmskb %xmm2, %r9d
/* The instruction that sets the flags for this jnz is not shown.  */
1507 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1508 movdqa (%rdi), %xmm3
/* Macro defined outside this view.  */
1510 UPDATE_STRNCMP_COUNTER
1512 mov $16, %rcx /* index for loads */
1513 mov $14, %r9d /* byte position left over from less32bytes case */
1515 * Setup %r10 value allows us to detect crossing a page boundary.
1516 * When %r10 goes positive we have crossed a page boundary and
1517 * need to do a nibble.
/* The instruction that first loads %r10 is absent from this listing.  The
   two visible steps leave %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative
   value in the range [-0x1000, -1].  */
1520 and $0xfff, %r10 /* offset into 4K page */
1521 sub $0x1000, %r10 /* subtract 4K pagesize */
1523 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* loop_ashr_14_use: main compare loop for the 14-byte shift case.  Each
   pass combines two aligned 16-byte loads from %rdi with palignr $14 and
   compares the result with 16 bytes at (%rsi,%rdx).  The body appears twice
   (unrolled).  The index updates, the exits taken right after pcmpistri and
   the #else/#endif lines are absent from this listing (see the gaps in the
   embedded line numbers).  */
1526 LABEL(loop_ashr_14_use):
/* Signed-greater branch to the nibble path.  The instruction that sets the
   flags is not shown; per the entry-code comment the nibble path is taken
   once %r10 goes positive.  */
1528 jg LABEL(nibble_ashr_14_use)
/* Re-entry point used by nibble_ashr_14_use.  */
1530 LABEL(nibble_ashr_14_restart_use):
/* %xmm0 = upper 2 bytes of the previous aligned block followed by the
   lower 14 bytes of the current aligned block of the %rdi string.  */
1531 movdqa (%rdi, %rdx), %xmm0
1532 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1533 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variants: compare directly against memory.  */
1534 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variants: load the %rsi bytes, apply TOLOWER (macro
   defined outside this view) to both operands, then compare registers.  */
1536 movdqa (%rsi,%rdx), %xmm1
1537 TOLOWER (%xmm0, %xmm1)
1538 pcmpistri $0x1a, %xmm1, %xmm0
1541 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1543 jbe LABEL(strcmp_exitz)
/* Second copy of the loop body.  */
1548 jg LABEL(nibble_ashr_14_use)
1550 movdqa (%rdi, %rdx), %xmm0
1551 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1552 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1553 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1555 movdqa (%rsi,%rdx), %xmm1
1556 TOLOWER (%xmm0, %xmm1)
1557 pcmpistri $0x1a, %xmm1, %xmm0
1560 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1562 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of blocks.  */
1565 jmp LABEL(loop_ashr_14_use)
/* nibble_ashr_14_use: side path taken from loop_ashr_14_use via jg; per
   the entry-code comment this is the page-boundary ("nibble") case.
   Several original lines (see the gaps in the embedded line numbers) are
   absent from this listing.  */
1568 LABEL(nibble_ashr_14_use):
/* Aligned reload of the 16 bytes just below index %rdx of the %rdi string,
   then a byte shift right by 14: the upper 2 bytes move to the bottom of
   %xmm0 and the remaining lanes are zero-filled.  */
1570 movdqa -16(%rdi, %rdx), %xmm0
1571 psrldq $14, D(%xmm0)
/* Compare %xmm0 with itself.  imm8 0x3a is the 0x1a mode used by the main
   loop with bit 5 added (masked negative polarity).
   NOTE(review): per the PCMPISTRI definition this should leave in %ecx the
   number of bytes that precede the first NUL in %xmm0 -- confirm.  */
1572 pcmpistri $0x3a,%xmm0, %xmm0
1573 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants only: leave through the common exit.  The
   compare that feeds this jae is not present in this listing.  */
1575 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop at its restart label; the compare that feeds this
   ja is likewise not shown.  Otherwise take the common exit.  */
1578 ja LABEL(nibble_ashr_14_restart_use)
1580 jmp LABEL(nibble_ashr_exit_use)
1583 * The following cases will be handled by ashr_15:
1584 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset        | corresponding case
1585 *   n (1~15)            | n - 1               | 14 (15 + (n - 1) - n)  | ashr_15
/* ashr_15 entry code.  The label line and some instructions (see the gaps
   in the embedded line numbers) are absent from this listing.  */
/* TOLOWER is a macro defined outside this view; by its name it case-folds
   its operands for the strcasecmp variants -- confirm.  */
1590 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = byte-wise equality mask of %xmm1 and %xmm2, then minus %xmm0.
   NOTE(review): %xmm0 presumably holds a NUL-detection mask at this point;
   it is set up outside this view.  */
1591 pcmpeqb %xmm1, D(%xmm2)
1592 psubb %xmm0, D(%xmm2)
/* Gather the top bit of each byte of %xmm2 into %r9d.  */
1593 pmovmskb %xmm2, %r9d
/* The instruction that sets the flags for this jnz is not shown.  */
1597 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1599 movdqa (%rdi), %xmm3
/* Macro defined outside this view.  */
1601 UPDATE_STRNCMP_COUNTER
1603 mov $16, %rcx /* index for loads */
1604 mov $15, %r9d /* byte position left over from less32bytes case */
1606 * Setup %r10 value allows us to detect crossing a page boundary.
1607 * When %r10 goes positive we have crossed a page boundary and
1608 * need to do a nibble.
/* The instruction that first loads %r10 is absent from this listing.  The
   two visible steps leave %r10 = (%r10 & 0xfff) - 0x1000, i.e. a negative
   value in the range [-0x1000, -1].  */
1611 and $0xfff, %r10 /* offset into 4K page */
1613 sub $0x1000, %r10 /* subtract 4K pagesize */
1615 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* loop_ashr_15_use: main compare loop for the 15-byte shift case.  Each
   pass combines two aligned 16-byte loads from %rdi with palignr $15 and
   compares the result with 16 bytes at (%rsi,%rdx).  The body appears twice
   (unrolled).  The index updates, the exits taken right after pcmpistri and
   the #else/#endif lines are absent from this listing (see the gaps in the
   embedded line numbers).  */
1618 LABEL(loop_ashr_15_use):
/* Signed-greater branch to the nibble path.  The instruction that sets the
   flags is not shown; per the entry-code comment the nibble path is taken
   once %r10 goes positive.  */
1620 jg LABEL(nibble_ashr_15_use)
/* Re-entry point used by nibble_ashr_15_use.  */
1622 LABEL(nibble_ashr_15_restart_use):
/* %xmm0 = top byte of the previous aligned block followed by the lower
   15 bytes of the current aligned block of the %rdi string.  */
1623 movdqa (%rdi, %rdx), %xmm0
1624 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1625 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variants: compare directly against memory.  */
1626 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variants: load the %rsi bytes, apply TOLOWER (macro
   defined outside this view) to both operands, then compare registers.  */
1628 movdqa (%rsi,%rdx), %xmm1
1629 TOLOWER (%xmm0, %xmm1)
1630 pcmpistri $0x1a, %xmm1, %xmm0
1633 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1635 jbe LABEL(strcmp_exitz)
/* Second copy of the loop body.  */
1640 jg LABEL(nibble_ashr_15_use)
1642 movdqa (%rdi, %rdx), %xmm0
1643 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1644 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1645 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1647 movdqa (%rsi,%rdx), %xmm1
1648 TOLOWER (%xmm0, %xmm1)
1649 pcmpistri $0x1a, %xmm1, %xmm0
1652 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1654 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of blocks.  */
1657 jmp LABEL(loop_ashr_15_use)
/* nibble_ashr_15_use: side path taken from loop_ashr_15_use via jg; per
   the entry-code comment this is the page-boundary ("nibble") case.
   Several original lines (see the gaps in the embedded line numbers) are
   absent from this listing.  Unlike the other nibble paths, no jmp to the
   exit is visible after the ja here: the nibble_ashr_exit_use label
   follows directly.  */
1660 LABEL(nibble_ashr_15_use):
/* Aligned reload of the 16 bytes just below index %rdx of the %rdi string,
   then a byte shift right by 15: the top byte moves to the bottom of %xmm0
   and the remaining lanes are zero-filled.  */
1662 movdqa -16(%rdi, %rdx), %xmm0
1663 psrldq $15, D(%xmm0)
/* Compare %xmm0 with itself.  imm8 0x3a is the 0x1a mode used by the main
   loop with bit 5 added (masked negative polarity).
   NOTE(review): per the PCMPISTRI definition this should leave in %ecx the
   number of bytes that precede the first NUL in %xmm0 -- confirm.  */
1664 pcmpistri $0x3a,%xmm0, %xmm0
1665 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants only: leave through the common exit.  The
   compare that feeds this jae is not present in this listing.  */
1667 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop at its restart label; the compare that feeds this
   ja is likewise not shown.  */
1670 ja LABEL(nibble_ashr_15_restart_use)
/* nibble_ashr_exit_use: common exit for the nibble_ashr_N_use paths.  It
   does one last 16-byte compare against (%rsi,%rdx), then fetches one byte
   from each string for the result.  Several original lines (inner labels,
   #else/#endif, the final subtraction and return) are absent from this
   listing; see the gaps in the embedded line numbers.  */
1672 LABEL(nibble_ashr_exit_use):
1673 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variants: compare %xmm0 directly against memory.  */
1674 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* Case-insensitive variants: load the %rsi bytes, apply TOLOWER (macro
   defined outside this view) to both operands, then compare registers.  */
1676 movdqa (%rsi,%rdx), %xmm1
1677 TOLOWER (%xmm0, %xmm1)
1678 pcmpistri $0x1a, %xmm1, %xmm0
/* Carry clear: go to strcmp_exitz.  This matches the file-header note that
   CFlag == 0 means the result is 0, assuming the lines missing between the
   compare and this branch leave the flags alone -- confirm.  */
1682 jnc LABEL(strcmp_exitz)
1683 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1685 jbe LABEL(strcmp_exitz)
/* %rdi = %rdi + %r9 - 16, where %r9 is the per-case byte position loaded
   by the ashr_N entry code.  Then load one byte from each string at index
   %rdx, zero-extended.  Any adjustment of %rdx before these loads is not
   visible here.  */
1688 lea -16(%rdi, %r9), %rdi
1689 movzbl (%rdi, %rdx), %eax
1690 movzbl (%rsi, %rdx), %edx
1695 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: replace each byte with its entry from the
   table of 4-byte values at _nl_C_LC_CTYPE_tolower+128*4, indexed by the
   byte value.  */
1696 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1697 movl (%rcx,%rdx,4), %edx
1698 movl (%rcx,%rax,4), %eax
/* Exit sequence that locates the first differing byte and loads it from
   each string.
   NOTE(review): the labels of this sequence are not present in this
   listing; it is presumably the less32bytes target used by the ashr_N
   entry code -- confirm.  The test/branch around the xchg and the final
   subtraction and return are also absent.  */
1705 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1706 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1709 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
/* Replace %rdx with the index of its lowest set bit.  */
1714 bsf %rdx, %rdx /* find and store bit index in %rdx */
1716 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: go to strcmp_exitz on below-or-equal; the
   instruction that sets these flags is not shown.  */
1718 jbe LABEL(strcmp_exitz)
/* Load the byte at index %rdx from each string, zero-extended.  */
1720 movzbl (%rsi, %rdx), %ecx
1721 movzbl (%rdi, %rdx), %eax
1723 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: replace each byte with its entry from the
   table of 4-byte values at _nl_C_LC_CTYPE_tolower+128*4, indexed by the
   byte value.  */
1724 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1725 movl (%rdx,%rcx,4), %ecx
1726 movl (%rdx,%rax,4), %eax
/* strcmp_exitz: shared exit label, the target of the jbe/jnc exits in the
   loops and exit paths above.  Its own instructions are not present in
   this listing.  */
1732 LABEL(strcmp_exitz):
1737 // XXX Same as code above
/* Case-insensitive variants: the same table lookup on %ecx and %eax as in
   the preceding sequence.
   NOTE(review): the label and the byte loads that belong in front of this
   lookup are absent from this listing.  */
1742 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1743 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1744 movl (%rdx,%rcx,4), %ecx
1745 movl (%rdx,%rax,4), %eax
/* Set the symbol size of STRCMP_SSE42 to the distance from its start to
   this point.  */
1751 .size STRCMP_SSE42, .-STRCMP_SSE42
1758 /* Put all SSE 4.2 functions together. */
/* Read-only dispatch table, placed in .rodata.SECTION.  It holds 16
   entries emitted with .int; each is the offset of an ashr_N label
   relative to the start of the table.  Entries 0..14 refer to ashr_1 ..
   ashr_15 and the last entry (index 15) refers to ashr_0.  The code that
   indexes the table is outside this view.  */
1759 .section .rodata.SECTION,"a",@progbits
1761 LABEL(unaligned_table):
1762 .int LABEL(ashr_1) - LABEL(unaligned_table)
1763 .int LABEL(ashr_2) - LABEL(unaligned_table)
1764 .int LABEL(ashr_3) - LABEL(unaligned_table)
1765 .int LABEL(ashr_4) - LABEL(unaligned_table)
1766 .int LABEL(ashr_5) - LABEL(unaligned_table)
1767 .int LABEL(ashr_6) - LABEL(unaligned_table)
1768 .int LABEL(ashr_7) - LABEL(unaligned_table)
1769 .int LABEL(ashr_8) - LABEL(unaligned_table)
1770 .int LABEL(ashr_9) - LABEL(unaligned_table)
1771 .int LABEL(ashr_10) - LABEL(unaligned_table)
1772 .int LABEL(ashr_11) - LABEL(unaligned_table)
1773 .int LABEL(ashr_12) - LABEL(unaligned_table)
1774 .int LABEL(ashr_13) - LABEL(unaligned_table)
1775 .int LABEL(ashr_14) - LABEL(unaligned_table)
1776 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* ashr_0 comes last, after ashr_15.  */
1777 .int LABEL(ashr_0) - LABEL(unaligned_table)