FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
simple_idct.c
Go to the documentation of this file.
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/mem.h"
24 #include "libavutil/x86/asm.h"
25 
26 #include "libavcodec/idctdsp.h"
27 
28 #include "idctdsp.h"
29 #include "simple_idct.h"
30 
31 #if HAVE_INLINE_ASM
32 
33 /* Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
34 23170.475006
35 22725.260826
36 21406.727617
37 19265.545870
38 16384.000000
39 12872.826198
40 8866.956905
41 4520.335430
42 */
43 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note the -0.5: stored as 16383, not 16384)
48 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50 #define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51 
52 #define ROW_SHIFT 11 // sets the rounding term 1<<(ROW_SHIFT-1) stored in coeffs[]; the DC_COND_IDCT/Z_COND_IDCT invocations pass the same value as the literal 11
53 #define COL_SHIFT 20 // 6 -- in the code visible here COL_SHIFT appears only inside comments; the column IDCT() invocations pass the literal 20. NOTE(review): meaning of the trailing "6" is not evident from this file -- confirm
54 
55 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // mask keeping 16-bit words 1 and 3 of a quadword and zeroing words 0 and 2; DC_COND_IDCT ANDs it with the first source quadword before ORing in the others and testing for zero
56 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 0x40000 in the low dword only; added in the "1:" branch of DC_COND_IDCT between "pslld $16" and "psrad $13"
57 
58 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // rows of four int16_t (8 bytes each); the byte offsets noted below match the N(%2) loads in idct() -- presumably %2 is this table, operand list not visible here, confirm
59  1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // byte offset 0: rounding term, passed to the row macros as "paddd (%2)"
60 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
61 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
62  1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // byte offset 8: rounding term for the first row ("paddd 8(%2)"); differs from offset 0 only by the extra 1
63  // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
64 // 0, 0, 0, 0,
65 // 0, 0, 0, 0,
66 
67  C4, C4, C4, C4, // byte offset 16
68  C4, -C4, C4, -C4, // byte offset 24
69 
70  C2, C6, C2, C6, // byte offset 32
71  C6, -C2, C6, -C2, // byte offset 40
72 
73  C1, C3, C1, C3, // byte offset 48
74  C5, C7, C5, C7, // byte offset 56
75 
76  C3, -C7, C3, -C7, // byte offset 64
77 -C1, -C5, -C1, -C5, // byte offset 72
78 
79  C5, -C1, C5, -C1, // byte offset 80
80  C7, C3, C7, C3, // byte offset 88
81 
82  C7, -C5, C7, -C5, // byte offset 96
83  C3, -C1, C3, -C1 // byte offset 104
84 };
85 
86 static inline void idct(int16_t *block)
87 {
88  LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
89  int16_t * const temp= (int16_t*)align_tmp;
90 
91  __asm__ volatile(
92 #if 0 //Alternative, simpler variant
93 
94 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
95  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
96  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
97  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
98  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
99  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
100  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
101  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
102  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
103  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
104  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
105  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
106  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
107  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
108  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
109  #rounder ", %%mm4 \n\t"\
110  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
111  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
112  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
113  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
114  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
115  #rounder ", %%mm0 \n\t"\
116  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
117  "paddd %%mm0, %%mm0 \n\t" \
118  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
119  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
120  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
121  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
122  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
123  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
124  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
125  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
126  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
127  "psrad $" #shift ", %%mm7 \n\t"\
128  "psrad $" #shift ", %%mm4 \n\t"\
129  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
130  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
131  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
132  "psrad $" #shift ", %%mm1 \n\t"\
133  "psrad $" #shift ", %%mm2 \n\t"\
134  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
135  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
136  "movq %%mm7, " #dst " \n\t"\
137  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
138  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
139  "movq %%mm2, 24+" #dst " \n\t"\
140  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
141  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
142  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
143  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
144  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
145  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
146  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
147  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
148  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
149  "psrad $" #shift ", %%mm2 \n\t"\
150  "psrad $" #shift ", %%mm0 \n\t"\
151  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
152  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
153  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
154  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
155  "psrad $" #shift ", %%mm6 \n\t"\
156  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
157  "movq %%mm2, 8+" #dst " \n\t"\
158  "psrad $" #shift ", %%mm4 \n\t"\
159  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
160  "movq %%mm4, 16+" #dst " \n\t"\
161 
162 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
163  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
164  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
165  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
166  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
167  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
168  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
169  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
170  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
171  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
172  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
173  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
174  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
175  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
176  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
177  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
178  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
179  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
180  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
181  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
182  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
183  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
184  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
185  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
186  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
187  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
188  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
189  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
190  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
191  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
192  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
193  "psrad $" #shift ", %%mm7 \n\t"\
194  "psrad $" #shift ", %%mm4 \n\t"\
195  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
196  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
197  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
198  "psrad $" #shift ", %%mm0 \n\t"\
199  "psrad $" #shift ", %%mm2 \n\t"\
200  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
201  "movd %%mm7, " #dst " \n\t"\
202  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
203  "movd %%mm0, 16+" #dst " \n\t"\
204  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
205  "movd %%mm2, 96+" #dst " \n\t"\
206  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
207  "movd %%mm4, 112+" #dst " \n\t"\
208  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
209  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
210  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
211  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
212  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
213  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
214  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
215  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
216  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
217  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
218  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
219  "psrad $" #shift ", %%mm2 \n\t"\
220  "psrad $" #shift ", %%mm5 \n\t"\
221  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
222  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
223  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
224  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
225  "psrad $" #shift ", %%mm6 \n\t"\
226  "psrad $" #shift ", %%mm4 \n\t"\
227  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
228  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
229  "movd %%mm2, 32+" #dst " \n\t"\
230  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
231  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
232  "movd %%mm6, 48+" #dst " \n\t"\
233  "movd %%mm4, 64+" #dst " \n\t"\
234  "movd %%mm5, 80+" #dst " \n\t"\
235 
236 
237 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
238  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
239  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
240  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
241  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
242  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
243  "pand %%mm0, %%mm4 \n\t"\
244  "por %%mm1, %%mm4 \n\t"\
245  "por %%mm2, %%mm4 \n\t"\
246  "por %%mm3, %%mm4 \n\t"\
247  "packssdw %%mm4,%%mm4 \n\t"\
248  "movd %%mm4, %%eax \n\t"\
249  "orl %%eax, %%eax \n\t"\
250  "jz 1f \n\t"\
251  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
252  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
253  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
254  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
255  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
256  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
257  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
258  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
259  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
260  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
261  #rounder ", %%mm4 \n\t"\
262  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
263  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
264  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
265  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
266  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
267  #rounder ", %%mm0 \n\t"\
268  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
269  "paddd %%mm0, %%mm0 \n\t" \
270  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
271  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
272  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
273  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
274  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
275  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
276  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
277  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
278  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
279  "psrad $" #shift ", %%mm7 \n\t"\
280  "psrad $" #shift ", %%mm4 \n\t"\
281  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
282  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
283  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
284  "psrad $" #shift ", %%mm1 \n\t"\
285  "psrad $" #shift ", %%mm2 \n\t"\
286  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
287  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
288  "movq %%mm7, " #dst " \n\t"\
289  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
290  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
291  "movq %%mm2, 24+" #dst " \n\t"\
292  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
293  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
294  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
295  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
296  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
297  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
298  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
299  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
300  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
301  "psrad $" #shift ", %%mm2 \n\t"\
302  "psrad $" #shift ", %%mm0 \n\t"\
303  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
304  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
305  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
306  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
307  "psrad $" #shift ", %%mm6 \n\t"\
308  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
309  "movq %%mm2, 8+" #dst " \n\t"\
310  "psrad $" #shift ", %%mm4 \n\t"\
311  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
312  "movq %%mm4, 16+" #dst " \n\t"\
313  "jmp 2f \n\t"\
314  "1: \n\t"\
315  "pslld $16, %%mm0 \n\t"\
316  "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
317  "psrad $13, %%mm0 \n\t"\
318  "packssdw %%mm0, %%mm0 \n\t"\
319  "movq %%mm0, " #dst " \n\t"\
320  "movq %%mm0, 8+" #dst " \n\t"\
321  "movq %%mm0, 16+" #dst " \n\t"\
322  "movq %%mm0, 24+" #dst " \n\t"\
323  "2: \n\t"
324 
325 
326 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
327 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
328 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
329 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
330 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
331 
332 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
333 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
334 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
335 
336 
337 //IDCT( src0, src4, src1, src5, dst, shift)
338 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
339 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
340 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
341 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
342 
343 #else
344 
345 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
346  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
347  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
348  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
349  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
350  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
351  "pand %%mm0, %%mm4 \n\t"\
352  "por %%mm1, %%mm4 \n\t"\
353  "por %%mm2, %%mm4 \n\t"\
354  "por %%mm3, %%mm4 \n\t"\
355  "packssdw %%mm4,%%mm4 \n\t"\
356  "movd %%mm4, %%eax \n\t"\
357  "orl %%eax, %%eax \n\t"\
358  "jz 1f \n\t"\
359  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
360  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
361  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
362  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
363  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
364  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
365  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
366  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
367  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
368  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
369  #rounder ", %%mm4 \n\t"\
370  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
371  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
372  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
373  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
374  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
375  #rounder ", %%mm0 \n\t"\
376  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
377  "paddd %%mm0, %%mm0 \n\t" \
378  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
379  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
380  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
381  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
382  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
383  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
384  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
385  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
386  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
387  "psrad $" #shift ", %%mm7 \n\t"\
388  "psrad $" #shift ", %%mm4 \n\t"\
389  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
390  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
391  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
392  "psrad $" #shift ", %%mm1 \n\t"\
393  "psrad $" #shift ", %%mm2 \n\t"\
394  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
395  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
396  "movq %%mm7, " #dst " \n\t"\
397  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
398  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
399  "movq %%mm2, 24+" #dst " \n\t"\
400  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
401  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
402  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
403  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
404  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
405  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
406  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
407  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
408  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
409  "psrad $" #shift ", %%mm2 \n\t"\
410  "psrad $" #shift ", %%mm0 \n\t"\
411  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
412  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
413  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
414  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
415  "psrad $" #shift ", %%mm6 \n\t"\
416  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
417  "movq %%mm2, 8+" #dst " \n\t"\
418  "psrad $" #shift ", %%mm4 \n\t"\
419  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
420  "movq %%mm4, 16+" #dst " \n\t"\
421  "jmp 2f \n\t"\
422  "1: \n\t"\
423  "pslld $16, %%mm0 \n\t"\
424  "paddd "MANGLE(d40000)", %%mm0 \n\t"\
425  "psrad $13, %%mm0 \n\t"\
426  "packssdw %%mm0, %%mm0 \n\t"\
427  "movq %%mm0, " #dst " \n\t"\
428  "movq %%mm0, 8+" #dst " \n\t"\
429  "movq %%mm0, 16+" #dst " \n\t"\
430  "movq %%mm0, 24+" #dst " \n\t"\
431  "2: \n\t"
432 
433 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
434  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
435  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
436  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
437  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
438  "movq %%mm0, %%mm4 \n\t"\
439  "por %%mm1, %%mm4 \n\t"\
440  "por %%mm2, %%mm4 \n\t"\
441  "por %%mm3, %%mm4 \n\t"\
442  "packssdw %%mm4,%%mm4 \n\t"\
443  "movd %%mm4, %%eax \n\t"\
444  "orl %%eax, %%eax \n\t"\
445  "jz " #bt " \n\t"\
446  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
447  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
448  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
449  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
450  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
451  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
452  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
453  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
454  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
455  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
456  #rounder ", %%mm4 \n\t"\
457  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
458  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
459  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
460  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
461  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
462  #rounder ", %%mm0 \n\t"\
463  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
464  "paddd %%mm0, %%mm0 \n\t" \
465  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
466  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
467  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
468  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
469  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
470  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
471  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
472  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
473  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
474  "psrad $" #shift ", %%mm7 \n\t"\
475  "psrad $" #shift ", %%mm4 \n\t"\
476  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
477  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
478  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
479  "psrad $" #shift ", %%mm1 \n\t"\
480  "psrad $" #shift ", %%mm2 \n\t"\
481  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
482  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
483  "movq %%mm7, " #dst " \n\t"\
484  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
485  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
486  "movq %%mm2, 24+" #dst " \n\t"\
487  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
488  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
489  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
490  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
491  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
492  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
493  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
494  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
495  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
496  "psrad $" #shift ", %%mm2 \n\t"\
497  "psrad $" #shift ", %%mm0 \n\t"\
498  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
499  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
500  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
501  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
502  "psrad $" #shift ", %%mm6 \n\t"\
503  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
504  "movq %%mm2, 8+" #dst " \n\t"\
505  "psrad $" #shift ", %%mm4 \n\t"\
506  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
507  "movq %%mm4, 16+" #dst " \n\t"\
508 
509 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
510  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
511  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
512  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
513  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
514  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
515  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
516  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
517  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
518  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
519  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
520  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
521  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
522  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
523  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
524  #rounder ", %%mm4 \n\t"\
525  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
526  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
527  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
528  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
529  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
530  #rounder ", %%mm0 \n\t"\
531  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
532  "paddd %%mm0, %%mm0 \n\t" \
533  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
534  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
535  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
536  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
537  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
538  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
539  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
540  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
541  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
542  "psrad $" #shift ", %%mm7 \n\t"\
543  "psrad $" #shift ", %%mm4 \n\t"\
544  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
545  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
546  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
547  "psrad $" #shift ", %%mm1 \n\t"\
548  "psrad $" #shift ", %%mm2 \n\t"\
549  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
550  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
551  "movq %%mm7, " #dst " \n\t"\
552  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
553  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
554  "movq %%mm2, 24+" #dst " \n\t"\
555  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
556  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
557  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
558  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
559  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
560  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
561  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
562  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
563  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
564  "psrad $" #shift ", %%mm2 \n\t"\
565  "psrad $" #shift ", %%mm0 \n\t"\
566  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
567  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
568  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
569  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
570  "psrad $" #shift ", %%mm6 \n\t"\
571  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
572  "movq %%mm2, 8+" #dst " \n\t"\
573  "psrad $" #shift ", %%mm4 \n\t"\
574  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
575  "movq %%mm4, 16+" #dst " \n\t"\
576 
577 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
578 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
579 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
580 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
581 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
582 
583 #undef IDCT
584 #define IDCT(src0, src4, src1, src5, dst, shift) \
585  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
586  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
587  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
588  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
589  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
590  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
591  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
592  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
593  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
594  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
595  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
596  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
597  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
598  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
599  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
600  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
601  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
602  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
603  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
604  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
605  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
606  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
607  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
608  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
609  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
610  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
611  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
612  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
613  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
614  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
615  "psrad $" #shift ", %%mm7 \n\t"\
616  "psrad $" #shift ", %%mm4 \n\t"\
617  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
618  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
619  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
620  "psrad $" #shift ", %%mm0 \n\t"\
621  "psrad $" #shift ", %%mm2 \n\t"\
622  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
623  "movd %%mm7, " #dst " \n\t"\
624  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
625  "movd %%mm0, 16+" #dst " \n\t"\
626  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
627  "movd %%mm2, 96+" #dst " \n\t"\
628  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
629  "movd %%mm4, 112+" #dst " \n\t"\
630  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
631  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
632  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
633  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
634  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
635  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
636  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
637  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
638  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
639  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
640  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
641  "psrad $" #shift ", %%mm2 \n\t"\
642  "psrad $" #shift ", %%mm5 \n\t"\
643  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
644  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
645  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
646  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
647  "psrad $" #shift ", %%mm6 \n\t"\
648  "psrad $" #shift ", %%mm4 \n\t"\
649  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
650  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
651  "movd %%mm2, 32+" #dst " \n\t"\
652  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
653  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
654  "movd %%mm6, 48+" #dst " \n\t"\
655  "movd %%mm4, 64+" #dst " \n\t"\
656  "movd %%mm5, 80+" #dst " \n\t"
657 
658 
659 //IDCT( src0, src4, src1, src5, dst, shift)
660 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
661 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
662 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
663 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
664  "jmp 9f \n\t"
665 
666  "# .p2align 4 \n\t"\
667  "4: \n\t"
668 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
669 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
670 
671 #undef IDCT
672 #define IDCT(src0, src4, src1, src5, dst, shift) \
673  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
674  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
675  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
676  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
677  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
678  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
679  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
680  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
681  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
682  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
683  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
684  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
685  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
686  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
687  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
688  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
689  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
690  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
691  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
692  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
693  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
694  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
695  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
696  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
697  "psrad $" #shift ", %%mm1 \n\t"\
698  "psrad $" #shift ", %%mm4 \n\t"\
699  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
700  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
701  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
702  "psrad $" #shift ", %%mm0 \n\t"\
703  "psrad $" #shift ", %%mm2 \n\t"\
704  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
705  "movd %%mm1, " #dst " \n\t"\
706  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
707  "movd %%mm0, 16+" #dst " \n\t"\
708  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
709  "movd %%mm2, 96+" #dst " \n\t"\
710  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
711  "movd %%mm4, 112+" #dst " \n\t"\
712  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
713  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
714  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
715  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
716  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
717  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
718  "psrad $" #shift ", %%mm2 \n\t"\
719  "psrad $" #shift ", %%mm5 \n\t"\
720  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
721  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
722  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
723  "psrad $" #shift ", %%mm6 \n\t"\
724  "psrad $" #shift ", %%mm1 \n\t"\
725  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
726  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
727  "movd %%mm2, 32+" #dst " \n\t"\
728  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
729  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
730  "movd %%mm6, 48+" #dst " \n\t"\
731  "movd %%mm1, 64+" #dst " \n\t"\
732  "movd %%mm5, 80+" #dst " \n\t"
733 
734 //IDCT( src0, src4, src1, src5, dst, shift)
735 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
736 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
737 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
738 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
739  "jmp 9f \n\t"
740 
741  "# .p2align 4 \n\t"\
742  "6: \n\t"
743 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
744 
745 #undef IDCT
746 #define IDCT(src0, src4, src1, src5, dst, shift) \
747  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
748  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
749  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
750  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
751  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
752  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
753  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
754  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
755  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
756  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
757  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
758  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
759  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
760  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
761  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
762  "psrad $" #shift ", %%mm1 \n\t"\
763  "psrad $" #shift ", %%mm4 \n\t"\
764  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
765  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
766  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
767  "psrad $" #shift ", %%mm0 \n\t"\
768  "psrad $" #shift ", %%mm2 \n\t"\
769  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
770  "movd %%mm1, " #dst " \n\t"\
771  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
772  "movd %%mm0, 16+" #dst " \n\t"\
773  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
774  "movd %%mm2, 96+" #dst " \n\t"\
775  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
776  "movd %%mm4, 112+" #dst " \n\t"\
777  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
778  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
779  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
780  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
781  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
782  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
783  "psrad $" #shift ", %%mm2 \n\t"\
784  "psrad $" #shift ", %%mm5 \n\t"\
785  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
786  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
787  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
788  "psrad $" #shift ", %%mm6 \n\t"\
789  "psrad $" #shift ", %%mm1 \n\t"\
790  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
791  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
792  "movd %%mm2, 32+" #dst " \n\t"\
793  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
794  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
795  "movd %%mm6, 48+" #dst " \n\t"\
796  "movd %%mm1, 64+" #dst " \n\t"\
797  "movd %%mm5, 80+" #dst " \n\t"
798 
799 
800 //IDCT( src0, src4, src1, src5, dst, shift)
801 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
802 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
803 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
804 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
805  "jmp 9f \n\t"
806 
807  "# .p2align 4 \n\t"\
808  "2: \n\t"
809 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
810 
811 #undef IDCT
812 #define IDCT(src0, src4, src1, src5, dst, shift) \
813  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
814  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
815  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
816  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
817  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
818  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
819  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
820  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
821  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
822  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
823  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
824  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
825  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
826  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
827  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
828  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
829  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
830  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
831  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
832  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
833  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
834  "psrad $" #shift ", %%mm7 \n\t"\
835  "psrad $" #shift ", %%mm4 \n\t"\
836  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
837  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
838  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
839  "psrad $" #shift ", %%mm0 \n\t"\
840  "psrad $" #shift ", %%mm2 \n\t"\
841  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
842  "movd %%mm7, " #dst " \n\t"\
843  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
844  "movd %%mm0, 16+" #dst " \n\t"\
845  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
846  "movd %%mm2, 96+" #dst " \n\t"\
847  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
848  "movd %%mm4, 112+" #dst " \n\t"\
849  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
850  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
851  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
852  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
853  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
854  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
855  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
856  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
857  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
858  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
859  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
860  "psrad $" #shift ", %%mm2 \n\t"\
861  "psrad $" #shift ", %%mm5 \n\t"\
862  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
863  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
864  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
865  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
866  "psrad $" #shift ", %%mm6 \n\t"\
867  "psrad $" #shift ", %%mm4 \n\t"\
868  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
869  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
870  "movd %%mm2, 32+" #dst " \n\t"\
871  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
872  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
873  "movd %%mm6, 48+" #dst " \n\t"\
874  "movd %%mm4, 64+" #dst " \n\t"\
875  "movd %%mm5, 80+" #dst " \n\t"
876 
877 //IDCT( src0, src4, src1, src5, dst, shift)
878 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
879 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
880 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
881 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
882  "jmp 9f \n\t"
883 
884  "# .p2align 4 \n\t"\
885  "3: \n\t"
886 #undef IDCT
887 #define IDCT(src0, src4, src1, src5, dst, shift) \
888  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
889  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
890  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
891  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
892  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
893  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
894  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
895  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
896  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
897  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
898  "movq 64(%2), %%mm3 \n\t"\
899  "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
900  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
901  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
902  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
903  "psrad $" #shift ", %%mm7 \n\t"\
904  "psrad $" #shift ", %%mm4 \n\t"\
905  "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
906  "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
907  "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
908  "psrad $" #shift ", %%mm0 \n\t"\
909  "psrad $" #shift ", %%mm1 \n\t"\
910  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
911  "movd %%mm7, " #dst " \n\t"\
912  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
913  "movd %%mm0, 16+" #dst " \n\t"\
914  "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
915  "movd %%mm1, 96+" #dst " \n\t"\
916  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
917  "movd %%mm4, 112+" #dst " \n\t"\
918  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
919  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
920  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
921  "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
922  "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
923  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
924  "psrad $" #shift ", %%mm1 \n\t"\
925  "psrad $" #shift ", %%mm5 \n\t"\
926  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
927  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
928  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
929  "psrad $" #shift ", %%mm6 \n\t"\
930  "psrad $" #shift ", %%mm4 \n\t"\
931  "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
932  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
933  "movd %%mm1, 32+" #dst " \n\t"\
934  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
935  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
936  "movd %%mm6, 48+" #dst " \n\t"\
937  "movd %%mm4, 64+" #dst " \n\t"\
938  "movd %%mm5, 80+" #dst " \n\t"
939 
940 
941 //IDCT( src0, src4, src1, src5, dst, shift)
942 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
943 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
944 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
945 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
946  "jmp 9f \n\t"
947 
948  "# .p2align 4 \n\t"\
949  "5: \n\t"
950 #undef IDCT
951 #define IDCT(src0, src4, src1, src5, dst, shift) \
952  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
953  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
954  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
955  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
956  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
957  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
958  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
959  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
960  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
961  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
962  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
963  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
964  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
965  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
966  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
967  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
968  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
969  "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
970  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
971  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
972  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
973  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
974  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
975  "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
976  "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
977  "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
978  "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
979  "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
980  "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
981  "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
982  "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
983  "psrad $" #shift ", %%mm4 \n\t"\
984  "psrad $" #shift ", %%mm7 \n\t"\
985  "psrad $" #shift ", %%mm3 \n\t"\
986  "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
987  "movq %%mm4, " #dst " \n\t"\
988  "psrad $" #shift ", %%mm0 \n\t"\
989  "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
990  "movq %%mm0, 16+" #dst " \n\t"\
991  "movq %%mm0, 96+" #dst " \n\t"\
992  "movq %%mm4, 112+" #dst " \n\t"\
993  "psrad $" #shift ", %%mm5 \n\t"\
994  "psrad $" #shift ", %%mm6 \n\t"\
995  "psrad $" #shift ", %%mm2 \n\t"\
996  "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
997  "movq %%mm5, 32+" #dst " \n\t"\
998  "psrad $" #shift ", %%mm1 \n\t"\
999  "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1000  "movq %%mm6, 48+" #dst " \n\t"\
1001  "movq %%mm6, 64+" #dst " \n\t"\
1002  "movq %%mm5, 80+" #dst " \n\t"
1003 
1004 
1005 //IDCT( src0, src4, src1, src5, dst, shift)
1006 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1007 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1008 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1009 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1010  "jmp 9f \n\t"
1011 
1012 
1013  "# .p2align 4 \n\t"\
1014  "1: \n\t"
1015 #undef IDCT
1016 #define IDCT(src0, src4, src1, src5, dst, shift) \
1017  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1018  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1019  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1020  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1021  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1022  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1023  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1024  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1025  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1026  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1027  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1028  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1029  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1030  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1031  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1032  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1033  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1034  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1035  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1036  "movq 64(%2), %%mm1 \n\t"\
1037  "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1038  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1039  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1040  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1041  "psrad $" #shift ", %%mm7 \n\t"\
1042  "psrad $" #shift ", %%mm4 \n\t"\
1043  "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1044  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1045  "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1046  "psrad $" #shift ", %%mm0 \n\t"\
1047  "psrad $" #shift ", %%mm3 \n\t"\
1048  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1049  "movd %%mm7, " #dst " \n\t"\
1050  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1051  "movd %%mm0, 16+" #dst " \n\t"\
1052  "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1053  "movd %%mm3, 96+" #dst " \n\t"\
1054  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1055  "movd %%mm4, 112+" #dst " \n\t"\
1056  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1057  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1058  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1059  "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1060  "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1061  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1062  "psrad $" #shift ", %%mm3 \n\t"\
1063  "psrad $" #shift ", %%mm5 \n\t"\
1064  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1065  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1066  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1067  "psrad $" #shift ", %%mm6 \n\t"\
1068  "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1069  "movd %%mm3, 32+" #dst " \n\t"\
1070  "psrad $" #shift ", %%mm4 \n\t"\
1071  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1072  "movd %%mm6, 48+" #dst " \n\t"\
1073  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1074  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1075  "movd %%mm4, 64+" #dst " \n\t"\
1076  "movd %%mm5, 80+" #dst " \n\t"
1077 
1078 
1079 //IDCT( src0, src4, src1, src5, dst, shift)
1080 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1081 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1082 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1083 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1084  "jmp 9f \n\t"
1085 
1086 
1087  "# .p2align 4 \n\t"
1088  "7: \n\t"
1089 #undef IDCT
1090 #define IDCT(src0, src4, src1, src5, dst, shift) \
1091  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1092  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1093  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1094  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1095  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1096  "psrad $" #shift ", %%mm4 \n\t"\
1097  "psrad $" #shift ", %%mm0 \n\t"\
1098  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1099  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1100  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1101  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1102  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1103  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1104  "psrad $" #shift ", %%mm1 \n\t"\
1105  "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1106  "movq %%mm4, " #dst " \n\t"\
1107  "psrad $" #shift ", %%mm2 \n\t"\
1108  "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1109  "movq %%mm0, 16+" #dst " \n\t"\
1110  "movq %%mm0, 96+" #dst " \n\t"\
1111  "movq %%mm4, 112+" #dst " \n\t"\
1112  "movq %%mm0, 32+" #dst " \n\t"\
1113  "movq %%mm4, 48+" #dst " \n\t"\
1114  "movq %%mm4, 64+" #dst " \n\t"\
1115  "movq %%mm0, 80+" #dst " \n\t"
1116 
1117 //IDCT( src0, src4, src1, src5, dst, shift)
1118 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1119 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1120 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1121 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1122 
1123 
1124 #endif
1125 
1126 /*
1127 Input
1128  00 40 04 44 20 60 24 64
1129  10 30 14 34 50 70 54 74
1130  01 41 03 43 21 61 23 63
1131  11 31 13 33 51 71 53 73
1132  02 42 06 46 22 62 26 66
1133  12 32 16 36 52 72 56 76
1134  05 45 07 47 25 65 27 67
1135  15 35 17 37 55 75 57 77
1136 
1137 Temp
1138  00 04 10 14 20 24 30 34
1139  40 44 50 54 60 64 70 74
1140  01 03 11 13 21 23 31 33
1141  41 43 51 53 61 63 71 73
1142  02 06 12 16 22 26 32 36
1143  42 46 52 56 62 66 72 76
1144  05 07 15 17 25 27 35 37
1145  45 47 55 57 65 67 75 77
1146 */
1147 
1148 "9: \n\t"
1149  :: "r" (block), "r" (temp), "r" (coeffs)
1150  NAMED_CONSTRAINTS_ADD(wm1010,d40000)
1151  : "%eax"
1152  );
1153 }
1154 
/**
 * Plain entry point for the MMX simple IDCT.
 *
 * Forwards to the file-local idct() routine, whose final (column) pass
 * stores its results back into @p block, so the transform is in place
 * from the caller's point of view.
 *
 * @param block coefficient block handed to idct(); overwritten with the
 *              transform output
 */
void ff_simple_idct_mmx(int16_t *block)
{
    /* Nothing besides the transform itself happens here. */
    idct(block);
}
1159 
1160 //FIXME merge add/put into the idct
1161 
/**
 * MMX simple IDCT followed by a store of the result into a picture.
 *
 * Step 1 transforms @p block in place via idct(); step 2 passes the
 * transformed block to ff_put_pixels_clamped() together with the
 * destination pointer and stride.
 *
 * @param dest      destination pointer forwarded to ff_put_pixels_clamped()
 * @param line_size stride forwarded to ff_put_pixels_clamped()
 *                  (presumably bytes per picture row -- confirm against
 *                  the idctdsp declaration)
 * @param block     coefficient block; overwritten by idct() before being
 *                  read by ff_put_pixels_clamped()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* The transform must finish first: the store reads its output. */
    idct(block);

    ff_put_pixels_clamped(block, dest, line_size);
}
/**
 * MMX simple IDCT followed by accumulation of the result into a picture.
 *
 * Step 1 transforms @p block in place via idct(); step 2 passes the
 * transformed block to ff_add_pixels_clamped() together with the
 * destination pointer and stride.
 *
 * @param dest      destination pointer forwarded to ff_add_pixels_clamped()
 *                  (presumably holds the prediction the residual is added
 *                  to -- confirm against the idctdsp declaration)
 * @param line_size stride forwarded to ff_add_pixels_clamped()
 * @param block     coefficient block; overwritten by idct() before being
 *                  read by ff_add_pixels_clamped()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* The transform must finish first: the accumulation reads its output. */
    idct(block);

    ff_add_pixels_clamped(block, dest, line_size);
}
1172 
1173 #endif /* HAVE_INLINE_ASM */