1eee7fcb2420682f613eb7f2528a7056f38d7e04
[sfrench/cifs-2.6.git] / include / asm-x86 / xor_64.h
1 /*
2  * Optimized RAID-5 checksumming functions for MMX and SSE.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2, or (at your option)
7  * any later version.
8  *
9  * You should have received a copy of the GNU General Public License
10  * (for example /usr/src/linux/COPYING); if not, write to the Free
11  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
12  */
13
14
15 /*
16  * Cache avoiding checksumming functions utilizing KNI instructions
17  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
18  */
19
20 /*
21  * Based on
22  * High-speed RAID5 checksumming functions utilizing SSE instructions.
23  * Copyright (C) 1998 Ingo Molnar.
24  */
25
26 /*
27  * x86-64 changes / gcc fixes from Andi Kleen. 
28  * Copyright 2002 Andi Kleen, SuSE Labs.
29  *
30  * This hasn't been optimized for the hammer yet, but there are likely
31  * no advantages to be gotten from x86-64 here anyways.
32  */
33
/*
 * Backing store for one saved XMM register: two unsigned longs, forced to
 * 16-byte alignment.  The xor routines keep an array of four of these on
 * the stack for XMMS_SAVE / XMMS_RESTORE.
 */
typedef struct {
	unsigned long a;
	unsigned long b;
} __attribute__((aligned(16))) xmm_store_t;
35
/*
 * XMMS_SAVE - open a section that is allowed to use %xmm0-%xmm3.
 *
 * Expects two locals in the expanding function: "unsigned long cr0" and
 * "xmm_store_t xmm_save[4]".  In order, it
 *   - disables preemption,
 *   - copies %cr0 into cr0,
 *   - executes clts (clears the TS bit so SSE instructions can be issued),
 *   - stores %xmm0-%xmm3 into xmm_save at 16-byte strides.
 *
 * The registers are saved by hand rather than through gcc, because there
 * is no easy way to make gcc emit the clts before its own register saving.
 */
#define XMMS_SAVE do {							\
	preempt_disable();						\
	__asm__ __volatile__ (						\
		"movq %%cr0,%[saved_cr0]         ;\n\t"			\
		"clts                            ;\n\t"			\
		"movups %%xmm0,(%[area])         ;\n\t"			\
		"movups %%xmm1,0x10(%[area])     ;\n\t"			\
		"movups %%xmm2,0x20(%[area])     ;\n\t"			\
		"movups %%xmm3,0x30(%[area])     ;\n\t"			\
		: [saved_cr0] "=&r" (cr0)				\
		: [area] "r" (xmm_save)					\
		: "memory");						\
} while (0)
51
/*
 * XMMS_RESTORE - close a section opened with XMMS_SAVE.
 *
 * Uses the same "cr0" and "xmm_save" locals as XMMS_SAVE.  After an sfence
 * it reloads %xmm0-%xmm3 from xmm_save, writes the saved value back to
 * %cr0 (which must come last, after the final SSE instruction), and then
 * re-enables preemption.
 */
#define XMMS_RESTORE do {						\
	__asm__ __volatile__ (						\
		"sfence                          ;\n\t"			\
		"movups (%[area]),%%xmm0         ;\n\t"			\
		"movups 0x10(%[area]),%%xmm1     ;\n\t"			\
		"movups 0x20(%[area]),%%xmm2     ;\n\t"			\
		"movups 0x30(%[area]),%%xmm3     ;\n\t"			\
		"movq   %[saved_cr0],%%cr0       ;\n\t"			\
		:							\
		: [saved_cr0] "r" (cr0), [area] "r" (xmm_save)		\
		: "memory");						\
	preempt_enable();						\
} while (0)
65
/*
 * Building blocks for the asm templates of the xor routines.  Each macro
 * expands to a string literal holding one assembler line.
 *
 * x selects a 16-byte slot (displacement 16*x), y selects %xmm<y>.
 * The named asm operands p1..p6 are the buffer pointers; p1 is the one
 * that is both loaded from and stored to.
 */

/* Displacement of slot x. */
#define OFFS(x)			"16*(" #x ")"
/* Displacement of slot x, 256 bytes further on (used by the prefetches). */
#define PF_OFFS(x)		"256+16*(" #x ")"

/* Generic forms, parameterised on the operand name p. */
#define XOR_PF(x, p)		"       prefetchnta " PF_OFFS(x) "(%[" #p "])         ;\n"
#define XOR_XO(x, y, p)		"       xorps   " OFFS(x) "(%[" #p "]), %%xmm" #y "     ;\n"

/* Load slot x of p1 into %xmm<y> / store %xmm<y> back to slot x of p1. */
#define LD(x, y)		"       movaps   " OFFS(x) "(%[p1]), %%xmm" #y "    ;\n"
#define ST(x, y)		"       movaps %%xmm" #y ",   " OFFS(x) "(%[p1])    ;\n"

/* Non-temporal prefetch of slot x, 256 bytes ahead, from p1..p6. */
#define PF0(x)			XOR_PF(x, p1)
#define PF1(x)			XOR_PF(x, p2)
#define PF2(x)			XOR_PF(x, p3)
#define PF3(x)			XOR_PF(x, p4)
#define PF4(x)			XOR_PF(x, p5)
#define PF5(x)			XOR_PF(x, p6)

/* XOR slot x of p2..p6 into %xmm<y>. */
#define XO1(x, y)		XOR_XO(x, y, p2)
#define XO2(x, y)		XOR_XO(x, y, p3)
#define XO3(x, y)		XOR_XO(x, y, p4)
#define XO4(x, y)		XOR_XO(x, y, p5)
#define XO5(x, y)		XOR_XO(x, y, p6)
81
82
83 static void
84 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
85 {
86         unsigned int lines = bytes >> 8;
87         unsigned long cr0;
88         xmm_store_t xmm_save[4];
89
90         XMMS_SAVE;
91
92         asm volatile (
93 #undef BLOCK
94 #define BLOCK(i) \
95                 LD(i,0)                                 \
96                         LD(i+1,1)                       \
97                 PF1(i)                                  \
98                                 PF1(i+2)                \
99                                 LD(i+2,2)               \
100                                         LD(i+3,3)       \
101                 PF0(i+4)                                \
102                                 PF0(i+6)                \
103                 XO1(i,0)                                \
104                         XO1(i+1,1)                      \
105                                 XO1(i+2,2)              \
106                                         XO1(i+3,3)      \
107                 ST(i,0)                                 \
108                         ST(i+1,1)                       \
109                                 ST(i+2,2)               \
110                                         ST(i+3,3)       \
111
112
113                 PF0(0)
114                                 PF0(2)
115
116         " .align 32                     ;\n"
117         " 1:                            ;\n"
118
119                 BLOCK(0)
120                 BLOCK(4)
121                 BLOCK(8)
122                 BLOCK(12)
123
124         "       addq %[inc], %[p1]           ;\n"
125         "       addq %[inc], %[p2]           ;\n"
126                 "               decl %[cnt] ; jnz 1b"
127         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
128         : [inc] "r" (256UL) 
129         : "memory");
130
131         XMMS_RESTORE;
132 }
133
134 static void
135 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
136           unsigned long *p3)
137 {
138         unsigned int lines = bytes >> 8;
139         xmm_store_t xmm_save[4];
140         unsigned long cr0;
141
142         XMMS_SAVE;
143
144         __asm__ __volatile__ (
145 #undef BLOCK
146 #define BLOCK(i) \
147                 PF1(i)                                  \
148                                 PF1(i+2)                \
149                 LD(i,0)                                 \
150                         LD(i+1,1)                       \
151                                 LD(i+2,2)               \
152                                         LD(i+3,3)       \
153                 PF2(i)                                  \
154                                 PF2(i+2)                \
155                 PF0(i+4)                                \
156                                 PF0(i+6)                \
157                 XO1(i,0)                                \
158                         XO1(i+1,1)                      \
159                                 XO1(i+2,2)              \
160                                         XO1(i+3,3)      \
161                 XO2(i,0)                                \
162                         XO2(i+1,1)                      \
163                                 XO2(i+2,2)              \
164                                         XO2(i+3,3)      \
165                 ST(i,0)                                 \
166                         ST(i+1,1)                       \
167                                 ST(i+2,2)               \
168                                         ST(i+3,3)       \
169
170
171                 PF0(0)
172                                 PF0(2)
173
174         " .align 32                     ;\n"
175         " 1:                            ;\n"
176
177                 BLOCK(0)
178                 BLOCK(4)
179                 BLOCK(8)
180                 BLOCK(12)
181
182         "       addq %[inc], %[p1]           ;\n"
183         "       addq %[inc], %[p2]          ;\n"
184         "       addq %[inc], %[p3]           ;\n"
185                 "               decl %[cnt] ; jnz 1b"
186         : [cnt] "+r" (lines),
187           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
188         : [inc] "r" (256UL)
189         : "memory"); 
190         XMMS_RESTORE;
191 }
192
193 static void
194 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
195           unsigned long *p3, unsigned long *p4)
196 {
197         unsigned int lines = bytes >> 8;
198         xmm_store_t xmm_save[4]; 
199         unsigned long cr0;
200
201         XMMS_SAVE;
202
203         __asm__ __volatile__ (
204 #undef BLOCK
205 #define BLOCK(i) \
206                 PF1(i)                                  \
207                                 PF1(i+2)                \
208                 LD(i,0)                                 \
209                         LD(i+1,1)                       \
210                                 LD(i+2,2)               \
211                                         LD(i+3,3)       \
212                 PF2(i)                                  \
213                                 PF2(i+2)                \
214                 XO1(i,0)                                \
215                         XO1(i+1,1)                      \
216                                 XO1(i+2,2)              \
217                                         XO1(i+3,3)      \
218                 PF3(i)                                  \
219                                 PF3(i+2)                \
220                 PF0(i+4)                                \
221                                 PF0(i+6)                \
222                 XO2(i,0)                                \
223                         XO2(i+1,1)                      \
224                                 XO2(i+2,2)              \
225                                         XO2(i+3,3)      \
226                 XO3(i,0)                                \
227                         XO3(i+1,1)                      \
228                                 XO3(i+2,2)              \
229                                         XO3(i+3,3)      \
230                 ST(i,0)                                 \
231                         ST(i+1,1)                       \
232                                 ST(i+2,2)               \
233                                         ST(i+3,3)       \
234
235
236                 PF0(0)
237                                 PF0(2)
238
239         " .align 32                     ;\n"
240         " 1:                            ;\n"
241
242                 BLOCK(0)
243                 BLOCK(4)
244                 BLOCK(8)
245                 BLOCK(12)
246
247         "       addq %[inc], %[p1]           ;\n"
248         "       addq %[inc], %[p2]           ;\n"
249         "       addq %[inc], %[p3]           ;\n"
250         "       addq %[inc], %[p4]           ;\n"
251         "       decl %[cnt] ; jnz 1b"
252         : [cnt] "+c" (lines),
253           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
254         : [inc] "r" (256UL)
255         : "memory" );
256
257         XMMS_RESTORE;
258 }
259
260 static void
261 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
262           unsigned long *p3, unsigned long *p4, unsigned long *p5)
263 {
264         unsigned int lines = bytes >> 8;
265         xmm_store_t xmm_save[4];
266         unsigned long cr0;
267
268         XMMS_SAVE;
269
270         __asm__ __volatile__ (
271 #undef BLOCK
272 #define BLOCK(i) \
273                 PF1(i)                                  \
274                                 PF1(i+2)                \
275                 LD(i,0)                                 \
276                         LD(i+1,1)                       \
277                                 LD(i+2,2)               \
278                                         LD(i+3,3)       \
279                 PF2(i)                                  \
280                                 PF2(i+2)                \
281                 XO1(i,0)                                \
282                         XO1(i+1,1)                      \
283                                 XO1(i+2,2)              \
284                                         XO1(i+3,3)      \
285                 PF3(i)                                  \
286                                 PF3(i+2)                \
287                 XO2(i,0)                                \
288                         XO2(i+1,1)                      \
289                                 XO2(i+2,2)              \
290                                         XO2(i+3,3)      \
291                 PF4(i)                                  \
292                                 PF4(i+2)                \
293                 PF0(i+4)                                \
294                                 PF0(i+6)                \
295                 XO3(i,0)                                \
296                         XO3(i+1,1)                      \
297                                 XO3(i+2,2)              \
298                                         XO3(i+3,3)      \
299                 XO4(i,0)                                \
300                         XO4(i+1,1)                      \
301                                 XO4(i+2,2)              \
302                                         XO4(i+3,3)      \
303                 ST(i,0)                                 \
304                         ST(i+1,1)                       \
305                                 ST(i+2,2)               \
306                                         ST(i+3,3)       \
307
308
309                 PF0(0)
310                                 PF0(2)
311
312         " .align 32                     ;\n"
313         " 1:                            ;\n"
314
315                 BLOCK(0)
316                 BLOCK(4)
317                 BLOCK(8)
318                 BLOCK(12)
319
320         "       addq %[inc], %[p1]           ;\n"
321         "       addq %[inc], %[p2]           ;\n"
322         "       addq %[inc], %[p3]           ;\n"
323         "       addq %[inc], %[p4]           ;\n"
324         "       addq %[inc], %[p5]           ;\n"
325         "       decl %[cnt] ; jnz 1b"
326         : [cnt] "+c" (lines),
327           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
328           [p5] "+r" (p5)
329         : [inc] "r" (256UL)
330         : "memory");
331
332         XMMS_RESTORE;
333 }
334
335 static struct xor_block_template xor_block_sse = {
336         .name = "generic_sse",
337         .do_2 = xor_sse_2,
338         .do_3 = xor_sse_3,
339         .do_4 = xor_sse_4,
340         .do_5 = xor_sse_5,
341 };
342
/* The only template handed to xor_speed() is the SSE one. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES	do { xor_speed(&xor_block_sse); } while (0)

/*
 * Template selection ignores FASTEST and always yields the SSE block.
 * It is forced because it can write around L2.  We may also be able to
 * load into the L1 only, depending on how the cpu deals with a load to a
 * line that is being prefetched.
 */
#define XOR_SELECT_TEMPLATE(FASTEST)	(&xor_block_sse)