vp8dsp_mmi.c
/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/attributes.h"
#include "libavutil/intfloat.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1 double db_1
#define DECLARE_DOUBLE_2 double db_2
#define DECLARE_UINT32_T uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)

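/*
 * MMI has no unsigned byte "greater than" compare, so MMI_PCMPGTUB below
 * synthesizes dst = (src1 > src2) ? 0xff : 0x00 per byte: pmaxub(src1, src2)
 * compares equal to src1 exactly where src1 >= src2, and xoring out the
 * pcmpeqb equality mask leaves only the strictly-greater bytes. A scalar
 * sketch of the same idea for one byte pair:
 *
 *     eq = (a == b) ? 0xff : 0x00;
 *     ge = (FFMAX(a, b) == a) ? 0xff : 0x00;
 *     gt = ge ^ eq;
 */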
#define MMI_PCMPGTUB(dst, src1, src2) \
    "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
    "pmaxub %[db_2], "#src1", "#src2" \n\t" \
    "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
    "pxor "#dst", %[db_2], %[db_1] \n\t"

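/*
 * MMI_BTOH sign-extends the eight bytes in src to two vectors of signed
 * halfwords: pcmpgtb against zero yields 0xff for every negative byte, and
 * punpcklbh/punpckhbh interleave that mask in as the high halves, which is
 * exactly a byte-to-halfword sign extension.
 */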
#define MMI_BTOH(dst_l, dst_r, src) \
    "pxor %[db_1], %[db_1], %[db_1] \n\t" \
    "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
    "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
    "punpckhbh "#dst_l", "#src", %[db_2] \n\t"

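/*
 * MMI_VP8_LOOP_FILTER is the SIMD version of the VP8 normal loop filter for
 * a macroblock edge, eight pixels at a time: it derives the high-edge-
 * variance (hev) and filter masks from the thresh/e/i limits, then applies
 * the strong filter with the 27/18/9 taps and +63 >> 7 rounding. The scalar
 * counterparts are vp8_normal_limit(), hev() and filter_mbedge() further
 * down in this file.
 */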
#define MMI_VP8_LOOP_FILTER \
    /* Calculation of hev */ \
    "dmtc1 %[thresh], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
    "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
    "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
    MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
    /* Calculation of mask */ \
    "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
    "li %[tmp0], 0x09 \n\t" \
    "dmtc1 %[tmp0], %[ftmp3] \n\t" \
    PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "dmtc1 %[e], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
    "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
    "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
    "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
    "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "dmtc1 %[i], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
    "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pxor %[mask], %[mask], %[ftmp3] \n\t" \
    /* VP8_MBFILTER */ \
    "li %[tmp0], 0x80808080 \n\t" \
    "dmtc1 %[tmp0], %[ftmp7] \n\t" \
    "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
    "pxor %[p2], %[p2], %[ftmp7] \n\t" \
    "pxor %[p1], %[p1], %[ftmp7] \n\t" \
    "pxor %[p0], %[p0], %[ftmp7] \n\t" \
    "pxor %[q0], %[q0], %[ftmp7] \n\t" \
    "pxor %[q1], %[q1], %[ftmp7] \n\t" \
    "pxor %[q2], %[q2], %[ftmp7] \n\t" \
    "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
    "psubb %[ftmp5], %[q0], %[p0] \n\t" \
    MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
    MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
    /* Right part */ \
    "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
    "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
    "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
    /* Left part */ \
    "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
    "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
    "pand %[ftmp1], %[ftmp1], %[mask] \n\t" \
    "pand %[ftmp2], %[ftmp1], %[hev] \n\t" \
    "li %[tmp0], 0x04040404 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp4] \n\t" \
    PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
    "li %[tmp0], 0x03030303 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
    "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    /* filt_val &= ~hev */ \
    "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "pxor %[hev], %[hev], %[ftmp0] \n\t" \
    "pand %[ftmp1], %[ftmp1], %[hev] \n\t" \
    MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
    "li %[tmp0], 0x07 \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    "li %[tmp0], 0x001b001b \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "li %[tmp0], 0x003f003f \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
    "pxor %[q0], %[q0], %[ftmp7] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    "pxor %[p0], %[p0], %[ftmp7] \n\t" \
    "li %[tmp0], 0x00120012 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
    "pxor %[q1], %[q1], %[ftmp7] \n\t" \
    "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
    "pxor %[p1], %[p1], %[ftmp7] \n\t" \
    "li %[tmp0], 0x03 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    /* Right part */ \
    "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
    "pxor %[q2], %[q2], %[ftmp7] \n\t" \
    "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
    "pxor %[p2], %[p2], %[ftmp7] \n\t"

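/*
 * The PUT_VP8_EPEL* macros evaluate the VP8 6-tap/4-tap subpel filters.
 * The coefficient tables store magnitudes only, so taps 1 and 4 are
 * subtracted while the rest are added, matching the reference
 * FILTER_6TAP/FILTER_4TAP formulas kept under #if 0 below; the sum is
 * rounded with %[ff_pw_64] and shifted right arithmetically by the 7 held
 * in %[ftmp4].
 */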
#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)

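/*
 * The PUT_VP8_BILINEAR* macros implement the two-tap bilinear interpolator:
 * %[a]/%[b] are the horizontal weights (nominally 8 - mx and mx) and
 * %[c]/%[d] the vertical ones (8 - my and my), with %[ff_pw_4] and the
 * shift count 3 in %[ftmp4] providing the (x + 4) >> 3 rounding.
 */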
#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};
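/*
 * Each row above is one entry of the scalar subpel_filters[] table (kept
 * under #if 0 below) with every coefficient widened to a 16-bit lane and
 * splatted four times so it can be fed straight to pmullh, e.g.
 * 123 == 0x007b becomes 0x007b007b007b007b.
 */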

#if 0
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
#endif

#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = (9  * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[q0], %[dst], 0x0)
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p2], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p3], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q2], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q3], %[tmp0], 0x0)
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        MMI_USDC1(%[q0], %[dst], 0x0)
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p0], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[p1], %[tmp0], 0x0)
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[p2], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q1], %[tmp0], 0x0)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_USDC1(%[q2], %[tmp0], 0x0)
        : RESTRICT_ASM_ALL64
          [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    DECLARE_VAR_ALL64;

    __asm__ volatile(
        /* Get data from dst */
        MMI_ULDC1(%[p3], %[dst], -0x04)
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        MMI_ULDC1(%[p2], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p1], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[p0], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q0], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q1], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q2], %[tmp0], -0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        MMI_ULDC1(%[q3], %[tmp0], -0x04)
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        MMI_USDC1(%[p3], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p2], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p1], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[p0], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q0], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q1], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q2], %[dst], -0x04)
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        MMI_USDC1(%[q3], %[dst], -0x04)
        : RESTRICT_ASM_ALL64
          [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

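/*
 * Inverse Walsh-Hadamard transform of the luma DC coefficients. The MMI
 * block below does the first butterfly stage on all four rows in 64-bit
 * registers; the second stage and the (x + 3) >> 3 rounding stay in scalar
 * C since each result only lands in the DC slot block[i][j][0]. The #else
 * branch is the plain scalar reference.
 */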
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
        "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
        "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
        "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
        "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
        "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t*)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    double ftmp[12];
    uint32_t tmp[1];
    union av_intfloat64 ff_ph_4e7b_u;
    union av_intfloat64 ff_ph_22a3_u;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;
    ff_ph_4e7b_u.i = 0x4e7b4e7b4e7b4e7bULL;
    ff_ph_22a3_u.i = 0x22a322a322a322a3ULL;

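    /*
     * 0x4e7b is 20091 and 0x22a3 is 8867 == 35468 / 4. pmulhh keeps the high
     * 16 bits of the signed product, so MUL_20091(x) becomes
     * pmulhh(x, 0x4e7b) + x and MUL_35468(x) becomes pmulhh(x << 2, 0x22a3),
     * the pre-shift by 2 making up for the /4; compare the scalar macros in
     * the #if 0 block near the top of the file.
     */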
    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li %[tmp0], 0x02 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"

        // block[0...3] + block[8...11]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // block[0...3] - block[8...11]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // MUL_35468(block[12...15])
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_35468(block[4...7])
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_20091(block[4...7])
        "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
        // MUL_20091(block[12...15])
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"

        // tmp[0 4 8 12]
        "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        // tmp[1 5 9 13]
        "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
        // tmp[2 6 10 14]
        "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
        // tmp[3 7 11 15]
        "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
        "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4 8 12]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // t[1 5 9 13]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // t[2 6 10 14]
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
        // t[3 7 11 15]
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
        "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"
        "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
        "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
        "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
        "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [block]"r"(block), [ff_pw_4]"f"(ff_pw_4.f),
          [ff_ph_4e7b]"f"(ff_ph_4e7b_u.f), [ff_ph_22a3]"f"(ff_ph_22a3_u.f)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[dc], %[ftmp5] \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}
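/*
 * For add4y the four 4x4 blocks lie side by side in one luma row (byte
 * offsets 0/4/8/12); for add4uv they form a 2x2 arrangement in the 8x8
 * chroma plane, hence the stride * 4 step down to the second pair.
 */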

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

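/*
 * The pixel-copy loops below handle two rows per iteration ("addiu %[h],
 * %[h], -0x02"), so they rely on h being even, which holds for the block
 * heights VP8 uses; unaligned accesses are covered by pairing ldl/ldr
 * (lwl/lwr for the 4-pixel case) and sdl/sdr around the MMI 64-bit copies.
 */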
void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x0f(%[src]) \n\t"
        "ldr %[tmp0], 0x08(%[src]) \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
        "ldr %[tmp1], 0x08(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "sdl %[tmp0], 0x0f(%[dst]) \n\t"
        "sdr %[tmp0], 0x08(%[dst]) \n\t"
        "addiu %[h], %[h], -0x02 \n\t"
        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
        "sdr %[tmp1], 0x08(%[addr1]) \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 16);
#endif
}

void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x07(%[addr0]) \n\t"
        "ldr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "sdl %[tmp0], 0x07(%[addr1]) \n\t"
        "sdr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}

void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "lwl %[tmp0], 0x03(%[addr0]) \n\t"
        "lwr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "swl %[tmp0], 0x03(%[addr1]) \n\t"
        "swr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 4);
#endif
}
1569 
1570 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1571  ptrdiff_t srcstride, int h, int mx, int my)
1572 {
1573 #if 1
1574  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1575  double ftmp[9];
1576  uint32_t tmp[1];
1577  union av_intfloat64 filter1;
1578  union av_intfloat64 filter2;
1579  union av_intfloat64 filter3;
1580  union av_intfloat64 filter4;
1581  mips_reg src1, dst1;
1583  filter1.i = filter[1];
1584  filter2.i = filter[2];
1585  filter3.i = filter[3];
1586  filter4.i = filter[4];
1587 
1588  /*
1589  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1590  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1591  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1592  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1593  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1594  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1595  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1596  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1597 
1598  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1599  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1600  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1601  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1602  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1603  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1604  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1605  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1606  */
1607  __asm__ volatile (
1608  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1609  "li %[tmp0], 0x07 \n\t"
1610  "mtc1 %[tmp0], %[ftmp4] \n\t"
1611 
1612  "1: \n\t"
1613  // 0 - 7
1614  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1615  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1616  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1617  // 8 - 15
1618  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1619 
1620  "addiu %[h], %[h], -0x01 \n\t"
1621  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1622  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1623  "bnez %[h], 1b \n\t"
1624  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1625  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1626  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1627  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1628  [ftmp8]"=&f"(ftmp[8]),
1629  [tmp0]"=&r"(tmp[0]),
1630  RESTRICT_ASM_ALL64
1631  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1632  [h]"+&r"(h),
1633  [dst]"+&r"(dst), [src]"+&r"(src)
1634  : [ff_pw_64]"f"(ff_pw_64.f),
1635  [srcstride]"r"((mips_reg)srcstride),
1636  [dststride]"r"((mips_reg)dststride),
1637  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1638  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1639  : "memory"
1640  );
1641 #else
1642  const uint8_t *filter = subpel_filters[mx - 1];
1643  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1644  int x, y;
1645 
1646  for (y = 0; y < h; y++) {
1647  for (x = 0; x < 16; x++)
1648  dst[x] = FILTER_4TAP(src, filter, 1);
1649  dst += dststride;
1650  src += srcstride;
1651  }
1652 #endif
1653 }
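/*
 * For reference, the FILTER_4TAP() macro used by the C fallback above lives
 * in libavcodec/vp8dsp.c; modulo formatting it should expand to the same
 * expression documented in the block comment:
 *
 *     #define FILTER_4TAP(src, F, stride)                                  \
 *         cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +    \
 *             F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] +    \
 *             64) >> 7]
 *
 * cm (ff_crop_tab + MAX_NEG_CROP) clamps the shifted result to 0..255,
 * i.e. it behaves like av_clip_uint8().
 */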
1654 
1655 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1656  ptrdiff_t srcstride, int h, int mx, int my)
1657 {
1658 #if 1
1659  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1660  double ftmp[9];
1661  uint32_t tmp[1];
1662  union av_intfloat64 filter1;
1663  union av_intfloat64 filter2;
1664  union av_intfloat64 filter3;
1665  union av_intfloat64 filter4;
1666  DECLARE_VAR_ALL64;
1667  filter1.i = filter[1];
1668  filter2.i = filter[2];
1669  filter3.i = filter[3];
1670  filter4.i = filter[4];
1671 
1672 
1673  /*
1674  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1675  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1676  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1677  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1678  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1679  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1680  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1681  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1682  */
1683  __asm__ volatile (
1684  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1685  "li %[tmp0], 0x07 \n\t"
1686  "mtc1 %[tmp0], %[ftmp4] \n\t"
1687 
1688  "1: \n\t"
1689  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1690 
1691  "addiu %[h], %[h], -0x01 \n\t"
1692  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1693  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1694  "bnez %[h], 1b \n\t"
1695  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1696  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1697  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1698  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1699  [ftmp8]"=&f"(ftmp[8]),
1700  [tmp0]"=&r"(tmp[0]),
1701  RESTRICT_ASM_ALL64
1702  [h]"+&r"(h),
1703  [dst]"+&r"(dst), [src]"+&r"(src)
1704  : [ff_pw_64]"f"(ff_pw_64.f),
1705  [srcstride]"r"((mips_reg)srcstride),
1706  [dststride]"r"((mips_reg)dststride),
1707  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1708  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1709  : "memory"
1710  );
1711 #else
1712  const uint8_t *filter = subpel_filters[mx - 1];
1713  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1714  int x, y;
1715 
1716  for (y = 0; y < h; y++) {
1717  for (x = 0; x < 8; x++)
1718  dst[x] = FILTER_4TAP(src, filter, 1);
1719  dst += dststride;
1720  src += srcstride;
1721  }
1722 #endif
1723 }
1724 
1725 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1726  ptrdiff_t srcstride, int h, int mx, int my)
1727 {
1728 #if 1
1729  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1730  double ftmp[6];
1731  uint32_t tmp[1];
1732  union av_intfloat64 filter1;
1733  union av_intfloat64 filter2;
1734  union av_intfloat64 filter3;
1735  union av_intfloat64 filter4;
1736  DECLARE_VAR_LOW32;
1737  filter1.i = filter[1];
1738  filter2.i = filter[2];
1739  filter3.i = filter[3];
1740  filter4.i = filter[4];
1741 
1742  /*
1743  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1744  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1745  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1746  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1747  */
1748  __asm__ volatile (
1749  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1750  "li %[tmp0], 0x07 \n\t"
1751  "mtc1 %[tmp0], %[ftmp4] \n\t"
1752 
1753  "1: \n\t"
1754  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1755 
1756  "addiu %[h], %[h], -0x01 \n\t"
1757  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1758  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1759  "bnez %[h], 1b \n\t"
1760  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1761  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1762  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1763  [tmp0]"=&r"(tmp[0]),
1764  RESTRICT_ASM_LOW32
1765  [h]"+&r"(h),
1766  [dst]"+&r"(dst), [src]"+&r"(src)
1767  : [ff_pw_64]"f"(ff_pw_64.f),
1768  [srcstride]"r"((mips_reg)srcstride),
1769  [dststride]"r"((mips_reg)dststride),
1770  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
1771  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
1772  : "memory"
1773  );
1774 #else
1775  const uint8_t *filter = subpel_filters[mx - 1];
1776  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1777  int x, y;
1778 
1779  for (y = 0; y < h; y++) {
1780  for (x = 0; x < 4; x++)
1781  dst[x] = FILTER_4TAP(src, filter, 1);
1782  dst += dststride;
1783  src += srcstride;
1784  }
1785 #endif
1786 }
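/*
 * fourtap_subpel_filters[] stores each tap pre-replicated as four 16-bit
 * lanes of a uint64_t; routing filter[n] through union av_intfloat64 lets
 * the value bind to an "f" (FPU register) asm constraint, since MMI vector
 * operations use the FP register file. mx ranges over 1..7 in these
 * functions: index 0 of the VP8 subpel table is the full-pel case, which is
 * served by the plain ff_put_vp8_pixels*_mmi copies instead.
 */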
1787 
1788 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1789  ptrdiff_t srcstride, int h, int mx, int my)
1790 {
1791 #if 1
1792  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1793  double ftmp[9];
1794  uint32_t tmp[1];
1795  mips_reg src1, dst1;
1796  union av_intfloat64 filter0;
1797  union av_intfloat64 filter1;
1798  union av_intfloat64 filter2;
1799  union av_intfloat64 filter3;
1800  union av_intfloat64 filter4;
1801  union av_intfloat64 filter5;
1802  DECLARE_VAR_ALL64;
1803  filter0.i = filter[0];
1804  filter1.i = filter[1];
1805  filter2.i = filter[2];
1806  filter3.i = filter[3];
1807  filter4.i = filter[4];
1808  filter5.i = filter[5];
1809 
1810  /*
1811  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1812  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1813  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1814  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1815  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1816  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1817  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1818  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1819 
1820  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1821  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1822  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1823  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1824  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1825  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1826  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1827  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1828  */
1829  __asm__ volatile (
1830  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1831  "li %[tmp0], 0x07 \n\t"
1832  "mtc1 %[tmp0], %[ftmp4] \n\t"
1833 
1834  "1: \n\t"
1835  // 0 - 7
1836  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1837  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1838  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1839  // 8 - 15
1840  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1841 
1842  "addiu %[h], %[h], -0x01 \n\t"
1843  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1844  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1845  "bnez %[h], 1b \n\t"
1846  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1847  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1848  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1849  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1850  [ftmp8]"=&f"(ftmp[8]),
1851  [tmp0]"=&r"(tmp[0]),
1852  RESTRICT_ASM_ALL64
1853  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1854  [h]"+&r"(h),
1855  [dst]"+&r"(dst), [src]"+&r"(src)
1856  : [ff_pw_64]"f"(ff_pw_64.f),
1857  [srcstride]"r"((mips_reg)srcstride),
1858  [dststride]"r"((mips_reg)dststride),
1859  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1860  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1861  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1862  : "memory"
1863  );
1864 #else
1865  const uint8_t *filter = subpel_filters[mx - 1];
1866  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1867  int x, y;
1868 
1869  for (y = 0; y < h; y++) {
1870  for (x = 0; x < 16; x++)
1871  dst[x] = FILTER_6TAP(src, filter, 1);
1872  dst += dststride;
1873  src += srcstride;
1874  }
1875 #endif
1876 }
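/*
 * Likewise FILTER_6TAP() from libavcodec/vp8dsp.c adds the two outer taps
 * (F[0] and F[5]) to the 4-tap kernel; modulo formatting:
 *
 *     #define FILTER_6TAP(src, F, stride)                                  \
 *         cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +    \
 *             F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -    \
 *             F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] +    \
 *             64) >> 7]
 */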
1877 
1878 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1879  ptrdiff_t srcstride, int h, int mx, int my)
1880 {
1881 #if 1
1882  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1883  double ftmp[9];
1884  uint32_t tmp[1];
1885  union av_intfloat64 filter0;
1886  union av_intfloat64 filter1;
1887  union av_intfloat64 filter2;
1888  union av_intfloat64 filter3;
1889  union av_intfloat64 filter4;
1890  union av_intfloat64 filter5;
1891  DECLARE_VAR_ALL64;
1892  filter0.i = filter[0];
1893  filter1.i = filter[1];
1894  filter2.i = filter[2];
1895  filter3.i = filter[3];
1896  filter4.i = filter[4];
1897  filter5.i = filter[5];
1898 
1899  /*
1900  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1901  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1902  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1903  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1904  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1905  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1906  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1907  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1908  */
1909  __asm__ volatile (
1910  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1911  "li %[tmp0], 0x07 \n\t"
1912  "mtc1 %[tmp0], %[ftmp4] \n\t"
1913 
1914  "1: \n\t"
1915  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1916 
1917  "addiu %[h], %[h], -0x01 \n\t"
1918  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1919  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1920  "bnez %[h], 1b \n\t"
1921  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1922  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1923  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1924  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1925  [ftmp8]"=&f"(ftmp[8]),
1926  [tmp0]"=&r"(tmp[0]),
1927  RESTRICT_ASM_ALL64
1928  [h]"+&r"(h),
1929  [dst]"+&r"(dst), [src]"+&r"(src)
1930  : [ff_pw_64]"f"(ff_pw_64.f),
1931  [srcstride]"r"((mips_reg)srcstride),
1932  [dststride]"r"((mips_reg)dststride),
1933  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
1934  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
1935  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
1936  : "memory"
1937  );
1938 #else
1939  const uint8_t *filter = subpel_filters[mx - 1];
1940  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1941  int x, y;
1942 
1943  for (y = 0; y < h; y++) {
1944  for (x = 0; x < 8; x++)
1945  dst[x] = FILTER_6TAP(src, filter, 1);
1946  dst += dststride;
1947  src += srcstride;
1948  }
1949 #endif
1950 }
1951 
1952 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1953  ptrdiff_t srcstride, int h, int mx, int my)
1954 {
1955 #if 1
1956  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1957  double ftmp[6];
1958  uint32_t tmp[1];
1959  union av_intfloat64 filter0;
1960  union av_intfloat64 filter1;
1961  union av_intfloat64 filter2;
1962  union av_intfloat64 filter3;
1963  union av_intfloat64 filter4;
1964  union av_intfloat64 filter5;
1965  DECLARE_VAR_LOW32;
1966  filter0.i = filter[0];
1967  filter1.i = filter[1];
1968  filter2.i = filter[2];
1969  filter3.i = filter[3];
1970  filter4.i = filter[4];
1971  filter5.i = filter[5];
1972 
1973  /*
1974  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1975  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1976  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1977  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1978  */
1979  __asm__ volatile (
1980  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1981  "li %[tmp0], 0x07 \n\t"
1982  "mtc1 %[tmp0], %[ftmp4] \n\t"
1983 
1984  "1: \n\t"
1985  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1986 
1987  "addiu %[h], %[h], -0x01 \n\t"
1988  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1989  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1990  "bnez %[h], 1b \n\t"
1991  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1992  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1993  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1994  [tmp0]"=&r"(tmp[0]),
1995  RESTRICT_ASM_LOW32
1996  [h]"+&r"(h),
1997  [dst]"+&r"(dst), [src]"+&r"(src)
1998  : [ff_pw_64]"f"(ff_pw_64.f),
1999  [srcstride]"r"((mips_reg)srcstride),
2000  [dststride]"r"((mips_reg)dststride),
2001  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2002  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2003  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2004  : "memory"
2005  );
2006 #else
2007  const uint8_t *filter = subpel_filters[mx - 1];
2008  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2009  int x, y;
2010 
2011  for (y = 0; y < h; y++) {
2012  for (x = 0; x < 4; x++)
2013  dst[x] = FILTER_6TAP(src, filter, 1);
2014  dst += dststride;
2015  src += srcstride;
2016  }
2017 #endif
2018 }
2019 
2020 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2021  ptrdiff_t srcstride, int h, int mx, int my)
2022 {
2023 #if 1
2024  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2025  double ftmp[9];
2026  uint32_t tmp[1];
2027  mips_reg src0, src1, dst0;
2028  union av_intfloat64 filter1;
2029  union av_intfloat64 filter2;
2030  union av_intfloat64 filter3;
2031  union av_intfloat64 filter4;
2032  DECLARE_VAR_ALL64;
2033  filter1.i = filter[1];
2034  filter2.i = filter[2];
2035  filter3.i = filter[3];
2036  filter4.i = filter[4];
2037 
2038  /*
2039  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2040  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2041  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2042  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2043  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2044  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2045  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2046  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2047 
2048  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2049  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2050  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2051  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2052  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2053  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2054  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2055  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2056  */
2057  __asm__ volatile (
2058  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2059  "li %[tmp0], 0x07 \n\t"
2060  "mtc1 %[tmp0], %[ftmp4] \n\t"
2061 
2062  "1: \n\t"
2063  // 0 - 7
2064  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2065  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2066  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2067  // 8 - 15
2068  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2069 
2070  "addiu %[h], %[h], -0x01 \n\t"
2071  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2072  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2073  "bnez %[h], 1b \n\t"
2074  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2075  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2076  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2077  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2078  [ftmp8]"=&f"(ftmp[8]),
2079  [tmp0]"=&r"(tmp[0]),
2080  RESTRICT_ASM_ALL64
2081  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2082  [src1]"=&r"(src1),
2083  [h]"+&r"(h),
2084  [dst]"+&r"(dst), [src]"+&r"(src)
2085  : [ff_pw_64]"f"(ff_pw_64.f),
2086  [srcstride]"r"((mips_reg)srcstride),
2087  [dststride]"r"((mips_reg)dststride),
2088  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2089  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2090  : "memory"
2091  );
2092 #else
2093  const uint8_t *filter = subpel_filters[my - 1];
2094  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2095  int x, y;
2096 
2097  for (y = 0; y < h; y++) {
2098  for (x = 0; x < 16; x++)
2099  dst[x] = FILTER_4TAP(src, filter, srcstride);
2100  dst += dststride;
2101  src += srcstride;
2102  }
2103 #endif
2104 }
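/*
 * The vertical variants apply the same kernel with the tap spacing set to
 * srcstride instead of 1, so the filter runs down a column: the C
 * fallback's FILTER_4TAP(src, filter, srcstride) reads src[x - srcstride],
 * src[x], src[x + srcstride] and src[x + 2 * srcstride] per output pixel.
 */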
2105 
2106 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2107  ptrdiff_t srcstride, int h, int mx, int my)
2108 {
2109 #if 1
2110  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2111  double ftmp[9];
2112  uint32_t tmp[1];
2113  mips_reg src1;
2114  union av_intfloat64 filter1;
2115  union av_intfloat64 filter2;
2116  union av_intfloat64 filter3;
2117  union av_intfloat64 filter4;
2118  DECLARE_VAR_ALL64;
2119  filter1.i = filter[1];
2120  filter2.i = filter[2];
2121  filter3.i = filter[3];
2122  filter4.i = filter[4];
2123 
2124  /*
2125  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2126  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2127  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2128  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2129  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2130  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2131  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2132  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2133  */
2134  __asm__ volatile (
2135  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2136  "li %[tmp0], 0x07 \n\t"
2137  "mtc1 %[tmp0], %[ftmp4] \n\t"
2138 
2139  "1: \n\t"
2140  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2141 
2142  "addiu %[h], %[h], -0x01 \n\t"
2143  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2144  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2145  "bnez %[h], 1b \n\t"
2146  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2147  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2148  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2149  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2150  [ftmp8]"=&f"(ftmp[8]),
2151  [tmp0]"=&r"(tmp[0]),
2152  RESTRICT_ASM_ALL64
2153  [src1]"=&r"(src1),
2154  [h]"+&r"(h),
2155  [dst]"+&r"(dst), [src]"+&r"(src)
2156  : [ff_pw_64]"f"(ff_pw_64.f),
2157  [srcstride]"r"((mips_reg)srcstride),
2158  [dststride]"r"((mips_reg)dststride),
2159  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2160  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2161  : "memory"
2162  );
2163 #else
2164  const uint8_t *filter = subpel_filters[my - 1];
2165  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2166  int x, y;
2167 
2168  for (y = 0; y < h; y++) {
2169  for (x = 0; x < 8; x++)
2170  dst[x] = FILTER_4TAP(src, filter, srcstride);
2171  dst += dststride;
2172  src += srcstride;
2173  }
2174 #endif
2175 }
2176 
2177 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2178  ptrdiff_t srcstride, int h, int mx, int my)
2179 {
2180 #if 1
2181  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2182  double ftmp[6];
2183  uint32_t tmp[1];
2184  mips_reg src1;
2185  union av_intfloat64 filter1;
2186  union av_intfloat64 filter2;
2187  union av_intfloat64 filter3;
2188  union av_intfloat64 filter4;
2189  DECLARE_VAR_LOW32;
2190  filter1.i = filter[1];
2191  filter2.i = filter[2];
2192  filter3.i = filter[3];
2193  filter4.i = filter[4];
2194 
2195  /*
2196  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2197  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2198  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2199  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2200  */
2201  __asm__ volatile (
2202  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2203  "li %[tmp0], 0x07 \n\t"
2204  "mtc1 %[tmp0], %[ftmp4] \n\t"
2205 
2206  "1: \n\t"
2207  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2208 
2209  "addiu %[h], %[h], -0x01 \n\t"
2210  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2211  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2212  "bnez %[h], 1b \n\t"
2213  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2214  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2215  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2216  [tmp0]"=&r"(tmp[0]),
2217  RESTRICT_ASM_LOW32
2218  [src1]"=&r"(src1),
2219  [h]"+&r"(h),
2220  [dst]"+&r"(dst), [src]"+&r"(src)
2221  : [ff_pw_64]"f"(ff_pw_64.f),
2222  [srcstride]"r"((mips_reg)srcstride),
2223  [dststride]"r"((mips_reg)dststride),
2224  [filter1]"f"(filter1.f), [filter2]"f"(filter2.f),
2225  [filter3]"f"(filter3.f), [filter4]"f"(filter4.f)
2226  : "memory"
2227  );
2228 #else
2229  const uint8_t *filter = subpel_filters[my - 1];
2230  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2231  int x, y;
2232 
2233  for (y = 0; y < h; y++) {
2234  for (x = 0; x < 4; x++)
2235  dst[x] = FILTER_4TAP(src, filter, srcstride);
2236  dst += dststride;
2237  src += srcstride;
2238  }
2239 #endif
2240 }
2241 
2242 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2243  ptrdiff_t srcstride, int h, int mx, int my)
2244 {
2245 #if 1
2246  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2247  double ftmp[9];
2248  uint32_t tmp[1];
2249  mips_reg src0, src1, dst0;
2250  union av_intfloat64 filter0;
2251  union av_intfloat64 filter1;
2252  union av_intfloat64 filter2;
2253  union av_intfloat64 filter3;
2254  union av_intfloat64 filter4;
2255  union av_intfloat64 filter5;
2256  DECLARE_VAR_ALL64;
2257  filter0.i = filter[0];
2258  filter1.i = filter[1];
2259  filter2.i = filter[2];
2260  filter3.i = filter[3];
2261  filter4.i = filter[4];
2262  filter5.i = filter[5];
2263 
2264  /*
2265  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2266  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2267  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2268  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2269  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2270  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2271  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2272  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2273 
2274  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2275  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2276  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2277  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2278  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2279  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2280  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2281  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2282  */
2283  __asm__ volatile (
2284  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2285  "li %[tmp0], 0x07 \n\t"
2286  "mtc1 %[tmp0], %[ftmp4] \n\t"
2287 
2288  "1: \n\t"
2289  // 0 - 7
2290  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2291  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2292  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2293  // 8 - 15
2294  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2295 
2296  "addiu %[h], %[h], -0x01 \n\t"
2297  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2298  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2299  "bnez %[h], 1b \n\t"
2300  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2301  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2302  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2303  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2304  [ftmp8]"=&f"(ftmp[8]),
2305  [tmp0]"=&r"(tmp[0]),
2306  RESTRICT_ASM_ALL64
2307  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2308  [src1]"=&r"(src1),
2309  [h]"+&r"(h),
2310  [dst]"+&r"(dst), [src]"+&r"(src)
2311  : [ff_pw_64]"f"(ff_pw_64.f),
2312  [srcstride]"r"((mips_reg)srcstride),
2313  [dststride]"r"((mips_reg)dststride),
2314  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2315  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2316  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2317  : "memory"
2318  );
2319 #else
2320  const uint8_t *filter = subpel_filters[my - 1];
2321  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2322  int x, y;
2323 
2324  for (y = 0; y < h; y++) {
2325  for (x = 0; x < 16; x++)
2326  dst[x] = FILTER_6TAP(src, filter, srcstride);
2327  dst += dststride;
2328  src += srcstride;
2329  }
2330 #endif
2331 }
2332 
2333 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2334  ptrdiff_t srcstride, int h, int mx, int my)
2335 {
2336 #if 1
2337  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2338  double ftmp[9];
2339  uint32_t tmp[1];
2340  mips_reg src1;
2341  union av_intfloat64 filter0;
2342  union av_intfloat64 filter1;
2343  union av_intfloat64 filter2;
2344  union av_intfloat64 filter3;
2345  union av_intfloat64 filter4;
2346  union av_intfloat64 filter5;
2347  DECLARE_VAR_ALL64;
2348  filter0.i = filter[0];
2349  filter1.i = filter[1];
2350  filter2.i = filter[2];
2351  filter3.i = filter[3];
2352  filter4.i = filter[4];
2353  filter5.i = filter[5];
2354 
2355  /*
2356  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2357  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2358  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2359  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2360  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2361  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2362  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2363  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2364  */
2365  __asm__ volatile (
2366  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2367  "li %[tmp0], 0x07 \n\t"
2368  "mtc1 %[tmp0], %[ftmp4] \n\t"
2369 
2370  "1: \n\t"
2371  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2372 
2373  "addiu %[h], %[h], -0x01 \n\t"
2374  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2375  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2376  "bnez %[h], 1b \n\t"
2377  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2378  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2379  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2380  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2381  [ftmp8]"=&f"(ftmp[8]),
2382  [tmp0]"=&r"(tmp[0]),
2383  RESTRICT_ASM_ALL64
2384  [src1]"=&r"(src1),
2385  [h]"+&r"(h),
2386  [dst]"+&r"(dst), [src]"+&r"(src)
2387  : [ff_pw_64]"f"(ff_pw_64.f),
2388  [srcstride]"r"((mips_reg)srcstride),
2389  [dststride]"r"((mips_reg)dststride),
2390  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2391  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2392  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2393  : "memory"
2394  );
2395 #else
2396  const uint8_t *filter = subpel_filters[my - 1];
2397  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2398  int x, y;
2399 
2400  for (y = 0; y < h; y++) {
2401  for (x = 0; x < 8; x++)
2402  dst[x] = FILTER_6TAP(src, filter, srcstride);
2403  dst += dststride;
2404  src += srcstride;
2405  }
2406 #endif
2407 }
2408 
2409 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2410  ptrdiff_t srcstride, int h, int mx, int my)
2411 {
2412 #if 1
2413  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2414  double ftmp[6];
2415  uint32_t tmp[1];
2416  mips_reg src1;
2417  union av_intfloat64 filter0;
2418  union av_intfloat64 filter1;
2419  union av_intfloat64 filter2;
2420  union av_intfloat64 filter3;
2421  union av_intfloat64 filter4;
2422  union av_intfloat64 filter5;
2423  DECLARE_VAR_LOW32;
2424  filter0.i = filter[0];
2425  filter1.i = filter[1];
2426  filter2.i = filter[2];
2427  filter3.i = filter[3];
2428  filter4.i = filter[4];
2429  filter5.i = filter[5];
2430 
2431  /*
2432  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2433  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2434  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2435  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2436  */
2437  __asm__ volatile (
2438  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2439  "li %[tmp0], 0x07 \n\t"
2440  "mtc1 %[tmp0], %[ftmp4] \n\t"
2441 
2442  "1: \n\t"
2443  PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2444 
2445  "addiu %[h], %[h], -0x01 \n\t"
2446  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2447  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2448  "bnez %[h], 1b \n\t"
2449  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2450  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2451  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2452  [tmp0]"=&r"(tmp[0]),
2453  RESTRICT_ASM_LOW32
2454  [src1]"=&r"(src1),
2455  [h]"+&r"(h),
2456  [dst]"+&r"(dst), [src]"+&r"(src)
2457  : [ff_pw_64]"f"(ff_pw_64.f),
2458  [srcstride]"r"((mips_reg)srcstride),
2459  [dststride]"r"((mips_reg)dststride),
2460  [filter0]"f"(filter0.f), [filter1]"f"(filter1.f),
2461  [filter2]"f"(filter2.f), [filter3]"f"(filter3.f),
2462  [filter4]"f"(filter4.f), [filter5]"f"(filter5.f)
2463  : "memory"
2464  );
2465 #else
2466  const uint8_t *filter = subpel_filters[my - 1];
2467  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2468  int x, y;
2469 
2470  for (y = 0; y < h; y++) {
2471  for (x = 0; x < 4; x++)
2472  dst[x] = FILTER_6TAP(src, filter, srcstride);
2473  dst += dststride;
2474  src += srcstride;
2475  }
2476 #endif
2477 }
2478 
2479 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2480  ptrdiff_t srcstride, int h, int mx, int my)
2481 {
2482 #if 1
2483  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2484  uint8_t *tmp = tmp_array;
2485 
2486  src -= srcstride;
2487  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2488  tmp = tmp_array + 16;
2489  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2490 #else
2491  const uint8_t *filter = subpel_filters[mx - 1];
2492  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2493  int x, y;
2494  uint8_t tmp_array[560];
2495  uint8_t *tmp = tmp_array;
2496 
2497  src -= srcstride;
2498 
2499  for (y = 0; y < h + 3; y++) {
2500  for (x = 0; x < 16; x++)
2501  tmp[x] = FILTER_4TAP(src, filter, 1);
2502  tmp += 16;
2503  src += srcstride;
2504  }
2505 
2506  tmp = tmp_array + 16;
2507  filter = subpel_filters[my - 1];
2508 
2509  for (y = 0; y < h; y++) {
2510  for (x = 0; x < 16; x++)
2511  dst[x] = FILTER_4TAP(tmp, filter, 16);
2512  dst += dststride;
2513  tmp += 16;
2514  }
2515 #endif
2516 }
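/*
 * The h?v? combinations are two-pass. The horizontal pass writes h + 3 rows
 * into tmp_array at a fixed stride of 16, starting one source row above the
 * block (src -= srcstride), because the 4-tap vertical pass needs taps at
 * row -1 and row +2. The vertical pass then reads from tmp_array + 16, i.e.
 * row 1, so its -1 tap lands on the lead-in row at tmp_array + 0. The
 * buffer is sized generously: 560 = 16 * (32 + 3) bytes covers heights up
 * to 32, although VP8 requests at most 16 rows here.
 */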
2517 
2518 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2519  ptrdiff_t srcstride, int h, int mx, int my)
2520 {
2521 #if 1
2522  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2523  uint8_t *tmp = tmp_array;
2524 
2525  src -= srcstride;
2526  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2527  tmp = tmp_array + 8;
2528  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2529 #else
2530  const uint8_t *filter = subpel_filters[mx - 1];
2531  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2532  int x, y;
2533  uint8_t tmp_array[152];
2534  uint8_t *tmp = tmp_array;
2535 
2536  src -= srcstride;
2537 
2538  for (y = 0; y < h + 3; y++) {
2539  for (x = 0; x < 8; x++)
2540  tmp[x] = FILTER_4TAP(src, filter, 1);
2541  tmp += 8;
2542  src += srcstride;
2543  }
2544 
2545  tmp = tmp_array + 8;
2546  filter = subpel_filters[my - 1];
2547 
2548  for (y = 0; y < h; y++) {
2549  for (x = 0; x < 8; x++)
2550  dst[x] = FILTER_4TAP(tmp, filter, 8);
2551  dst += dststride;
2552  tmp += 8;
2553  }
2554 #endif
2555 }
2556 
2557 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2558  ptrdiff_t srcstride, int h, int mx, int my)
2559 {
2560 #if 1
2561  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2562  uint8_t *tmp = tmp_array;
2563 
2564  src -= srcstride;
2565  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2566  tmp = tmp_array + 4;
2567  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2568 #else
2569  const uint8_t *filter = subpel_filters[mx - 1];
2570  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2571  int x, y;
2572  uint8_t tmp_array[44];
2573  uint8_t *tmp = tmp_array;
2574 
2575  src -= srcstride;
2576 
2577  for (y = 0; y < h + 3; y++) {
2578  for (x = 0; x < 4; x++)
2579  tmp[x] = FILTER_4TAP(src, filter, 1);
2580  tmp += 4;
2581  src += srcstride;
2582  }
2583  tmp = tmp_array + 4;
2584  filter = subpel_filters[my - 1];
2585 
2586  for (y = 0; y < h; y++) {
2587  for (x = 0; x < 4; x++)
2588  dst[x] = FILTER_4TAP(tmp, filter, 4);
2589  dst += dststride;
2590  tmp += 4;
2591  }
2592 #endif
2593 }
2594 
2595 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2596  ptrdiff_t srcstride, int h, int mx, int my)
2597 {
2598 #if 1
2599  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2600  uint8_t *tmp = tmp_array;
2601 
2602  src -= 2 * srcstride;
2603  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2604  tmp = tmp_array + 32;
2605  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2606 #else
2607  const uint8_t *filter = subpel_filters[mx - 1];
2608  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2609  int x, y;
2610  uint8_t tmp_array[592];
2611  uint8_t *tmp = tmp_array;
2612 
2613  src -= 2 * srcstride;
2614 
2615  for (y = 0; y < h + 5; y++) {
2616  for (x = 0; x < 16; x++)
2617  tmp[x] = FILTER_4TAP(src, filter, 1);
2618  tmp += 16;
2619  src += srcstride;
2620  }
2621 
2622  tmp = tmp_array + 32;
2623  filter = subpel_filters[my - 1];
2624 
2625  for (y = 0; y < h; y++) {
2626  for (x = 0; x < 16; x++)
2627  dst[x] = FILTER_6TAP(tmp, filter, 16);
2628  dst += dststride;
2629  tmp += 16;
2630  }
2631 #endif
2632 }
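/*
 * With a 6-tap vertical pass the margins grow to two rows above and three
 * below: the first pass starts at src - 2 * srcstride and produces h + 5
 * rows, and the second pass reads from tmp_array + 32 (row 2) so that its
 * -2 tap hits tmp_array + 0. Hence tmp_array[592]: 592 = 16 * (32 + 5).
 */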
2633 
2634 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2635  ptrdiff_t srcstride, int h, int mx, int my)
2636 {
2637 #if 1
2638  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2639  uint8_t *tmp = tmp_array;
2640 
2641  src -= 2 * srcstride;
2642  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2643  tmp = tmp_array + 16;
2644  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2645 #else
2646  const uint8_t *filter = subpel_filters[mx - 1];
2647  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2648  int x, y;
2649  uint8_t tmp_array[168];
2650  uint8_t *tmp = tmp_array;
2651 
2652  src -= 2 * srcstride;
2653 
2654  for (y = 0; y < h + 5; y++) {
2655  for (x = 0; x < 8; x++)
2656  tmp[x] = FILTER_4TAP(src, filter, 1);
2657  tmp += 8;
2658  src += srcstride;
2659  }
2660 
2661  tmp = tmp_array + 16;
2662  filter = subpel_filters[my - 1];
2663 
2664  for (y = 0; y < h; y++) {
2665  for (x = 0; x < 8; x++)
2666  dst[x] = FILTER_6TAP(tmp, filter, 8);
2667  dst += dststride;
2668  tmp += 8;
2669  }
2670 #endif
2671 }
2672 
2673 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2674  ptrdiff_t srcstride, int h, int mx, int my)
2675 {
2676 #if 1
2677  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2678  uint8_t *tmp = tmp_array;
2679 
2680  src -= 2 * srcstride;
2681  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2682  tmp = tmp_array + 8;
2683  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2684 #else
2685  const uint8_t *filter = subpel_filters[mx - 1];
2686  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2687  int x, y;
2688  uint8_t tmp_array[52];
2689  uint8_t *tmp = tmp_array;
2690 
2691  src -= 2 * srcstride;
2692 
2693  for (y = 0; y < h + 5; y++) {
2694  for (x = 0; x < 4; x++)
2695  tmp[x] = FILTER_4TAP(src, filter, 1);
2696  tmp += 4;
2697  src += srcstride;
2698  }
2699 
2700  tmp = tmp_array + 8;
2701  filter = subpel_filters[my - 1];
2702 
2703  for (y = 0; y < h; y++) {
2704  for (x = 0; x < 4; x++)
2705  dst[x] = FILTER_6TAP(tmp, filter, 4);
2706  dst += dststride;
2707  tmp += 4;
2708  }
2709 #endif
2710 }
2711 
2712 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2713  ptrdiff_t srcstride, int h, int mx, int my)
2714 {
2715 #if 1
2716  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2717  uint8_t *tmp = tmp_array;
2718 
2719  src -= srcstride;
2720  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2721  tmp = tmp_array + 16;
2722  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2723 #else
2724  const uint8_t *filter = subpel_filters[mx - 1];
2725  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2726  int x, y;
2727  uint8_t tmp_array[560];
2728  uint8_t *tmp = tmp_array;
2729 
2730  src -= srcstride;
2731 
2732  for (y = 0; y < h + 3; y++) {
2733  for (x = 0; x < 16; x++)
2734  tmp[x] = FILTER_6TAP(src, filter, 1);
2735  tmp += 16;
2736  src += srcstride;
2737  }
2738 
2739  tmp = tmp_array + 16;
2740  filter = subpel_filters[my - 1];
2741 
2742  for (y = 0; y < h; y++) {
2743  for (x = 0; x < 16; x++)
2744  dst[x] = FILTER_4TAP(tmp, filter, 16);
2745  dst += dststride;
2746  tmp += 16;
2747  }
2748 #endif
2749 }
2750 
2751 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2752  ptrdiff_t srcstride, int h, int mx, int my)
2753 {
2754 #if 1
2755  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2756  uint8_t *tmp = tmp_array;
2757 
2758  src -= srcstride;
2759  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2760  tmp = tmp_array + 8;
2761  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2762 #else
2763  const uint8_t *filter = subpel_filters[mx - 1];
2764  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2765  int x, y;
2766  uint8_t tmp_array[152];
2767  uint8_t *tmp = tmp_array;
2768 
2769  src -= srcstride;
2770 
2771  for (y = 0; y < h + 3; y++) {
2772  for (x = 0; x < 8; x++)
2773  tmp[x] = FILTER_6TAP(src, filter, 1);
2774  tmp += 8;
2775  src += srcstride;
2776  }
2777 
2778  tmp = tmp_array + 8;
2779  filter = subpel_filters[my - 1];
2780 
2781  for (y = 0; y < h; y++) {
2782  for (x = 0; x < 8; x++)
2783  dst[x] = FILTER_4TAP(tmp, filter, 8);
2784  dst += dststride;
2785  tmp += 8;
2786  }
2787 #endif
2788 }
2789 
2790 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2791  ptrdiff_t srcstride, int h, int mx, int my)
2792 {
2793 #if 1
2794  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2795  uint8_t *tmp = tmp_array;
2796 
2797  src -= srcstride;
2798  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2799  tmp = tmp_array + 4;
2800  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2801 #else
2802  const uint8_t *filter = subpel_filters[mx - 1];
2803  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2804  int x, y;
2805  uint8_t tmp_array[44];
2806  uint8_t *tmp = tmp_array;
2807 
2808  src -= srcstride;
2809 
2810  for (y = 0; y < h + 3; y++) {
2811  for (x = 0; x < 4; x++)
2812  tmp[x] = FILTER_6TAP(src, filter, 1);
2813  tmp += 4;
2814  src += srcstride;
2815  }
2816 
2817  tmp = tmp_array + 4;
2818  filter = subpel_filters[my - 1];
2819 
2820  for (y = 0; y < h; y++) {
2821  for (x = 0; x < 4; x++)
2822  dst[x] = FILTER_4TAP(tmp, filter, 4);
2823  dst += dststride;
2824  tmp += 4;
2825  }
2826 #endif
2827 }
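/*
 * The h6v4 pairings mirror h4v6: a 6-tap horizontal pass feeds a 4-tap
 * vertical pass, so only the 4-tap margins apply: h + 3 temporary rows, one
 * row of lead-in, and a second pass reading from row 1 of tmp_array.
 */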
2828 
2829 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2830  ptrdiff_t srcstride, int h, int mx, int my)
2831 {
2832 #if 1
2833  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2834  uint8_t *tmp = tmp_array;
2835 
2836  src -= 2 * srcstride;
2837  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2838  tmp = tmp_array + 32;
2839  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2840 #else
2841  const uint8_t *filter = subpel_filters[mx - 1];
2842  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2843  int x, y;
2844  uint8_t tmp_array[592];
2845  uint8_t *tmp = tmp_array;
2846 
2847  src -= 2 * srcstride;
2848 
2849  for (y = 0; y < h + 5; y++) {
2850  for (x = 0; x < 16; x++)
2851  tmp[x] = FILTER_6TAP(src, filter, 1);
2852  tmp += 16;
2853  src += srcstride;
2854  }
2855 
2856  tmp = tmp_array + 32;
2857  filter = subpel_filters[my - 1];
2858 
2859  for (y = 0; y < h; y++) {
2860  for (x = 0; x < 16; x++)
2861  dst[x] = FILTER_6TAP(tmp, filter, 16);
2862  dst += dststride;
2863  tmp += 16;
2864  }
2865 #endif
2866 }
2867 
2868 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2869  ptrdiff_t srcstride, int h, int mx, int my)
2870 {
2871 #if 1
2872  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2873  uint8_t *tmp = tmp_array;
2874 
2875  src -= 2 * srcstride;
2876  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2877  tmp = tmp_array + 16;
2878  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2879 #else
2880  const uint8_t *filter = subpel_filters[mx - 1];
2881  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2882  int x, y;
2883  uint8_t tmp_array[168];
2884  uint8_t *tmp = tmp_array;
2885 
2886  src -= 2 * srcstride;
2887 
2888  for (y = 0; y < h + 5; y++) {
2889  for (x = 0; x < 8; x++)
2890  tmp[x] = FILTER_6TAP(src, filter, 1);
2891  tmp += 8;
2892  src += srcstride;
2893  }
2894 
2895  tmp = tmp_array + 16;
2896  filter = subpel_filters[my - 1];
2897 
2898  for (y = 0; y < h; y++) {
2899  for (x = 0; x < 8; x++)
2900  dst[x] = FILTER_6TAP(tmp, filter, 8);
2901  dst += dststride;
2902  tmp += 8;
2903  }
2904 #endif
2905 }
2906 
2907 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2908  ptrdiff_t srcstride, int h, int mx, int my)
2909 {
2910 #if 1
2911  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2912  uint8_t *tmp = tmp_array;
2913 
2914  src -= 2 * srcstride;
2915  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2916  tmp = tmp_array + 8;
2917  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2918 #else
2919  const uint8_t *filter = subpel_filters[mx - 1];
2920  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2921  int x, y;
2922  uint8_t tmp_array[52];
2923  uint8_t *tmp = tmp_array;
2924 
2925  src -= 2 * srcstride;
2926 
2927  for (y = 0; y < h + 5; y++) {
2928  for (x = 0; x < 4; x++)
2929  tmp[x] = FILTER_6TAP(src, filter, 1);
2930  tmp += 4;
2931  src += srcstride;
2932  }
2933 
2934  tmp = tmp_array + 8;
2935  filter = subpel_filters[my - 1];
2936 
2937  for (y = 0; y < h; y++) {
2938  for (x = 0; x < 4; x++)
2939  dst[x] = FILTER_6TAP(tmp, filter, 4);
2940  dst += dststride;
2941  tmp += 4;
2942  }
2943 #endif
2944 }
2945 
2946 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2947  ptrdiff_t sstride, int h, int mx, int my)
2948 {
2949 #if 1
2950  union mmi_intfloat64 a, b;
2951  double ftmp[7];
2952  uint32_t tmp[1];
2953  mips_reg dst0, src0;
2954  DECLARE_VAR_ALL64;
2955  a.i = 8 - mx;
2956  b.i = mx;
2957 
2958  /*
2959  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2960  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2961  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2962  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2963  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2964  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2965  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2966  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2967 
2968  dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2969  dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2970  dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2971  dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2972  dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2973  dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2974  dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2975  dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2976  */
2977  __asm__ volatile (
2978  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2979  "li %[tmp0], 0x03 \n\t"
2980  "mtc1 %[tmp0], %[ftmp4] \n\t"
2981  "pshufh %[a], %[a], %[ftmp0] \n\t"
2982  "pshufh %[b], %[b], %[ftmp0] \n\t"
2983 
2984  "1: \n\t"
2985  // 0 - 7
2986  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2987  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2988  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2989  // 8 - 15
2990  PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2991 
2992  "addiu %[h], %[h], -0x01 \n\t"
2993  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2994  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2995  "bnez %[h], 1b \n\t"
2996  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2997  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2998  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2999  [ftmp6]"=&f"(ftmp[6]),
3000  [tmp0]"=&r"(tmp[0]),
3001  RESTRICT_ASM_ALL64
3002  [dst0]"=&r"(dst0), [src0]"=&r"(src0),
3003  [h]"+&r"(h),
3004  [dst]"+&r"(dst), [src]"+&r"(src),
3005  [a]"+&f"(a.f), [b]"+&f"(b.f)
3006  : [sstride]"r"((mips_reg)sstride),
3007  [dstride]"r"((mips_reg)dstride),
3008  [ff_pw_4]"f"(ff_pw_4.f)
3009  : "memory"
3010  );
3011 #else
3012  int a = 8 - mx, b = mx;
3013  int x, y;
3014 
3015  for (y = 0; y < h; y++) {
3016  for (x = 0; x < 16; x++)
3017  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3018  dst += dstride;
3019  src += sstride;
3020  }
3021 #endif
3022 }
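/*
 * The bilinear kernels broadcast the two weights across all four 16-bit
 * lanes with "pshufh %[a], %[a], %[ftmp0]" (ftmp0 is zero, so every lane
 * copies lane 0) and then compute four pixels at a time. A scalar sketch of
 * one output pixel, with mx in 1..7:
 *
 *     int a = 8 - mx, b = mx;
 *     dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;  // +4 rounds to nearest
 */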
3023 
3024 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3025  ptrdiff_t sstride, int h, int mx, int my)
3026 {
3027 #if 1
3028  union mmi_intfloat64 c, d;
3029  double ftmp[7];
3030  uint32_t tmp[1];
3031  mips_reg src0, src1, dst0;
3032  DECLARE_VAR_ALL64;
3033  c.i = 8 - my;
3034  d.i = my;
3035 
3036  /*
3037  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3038  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3039  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3040  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3041  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3042  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3043  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3044  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3045  */
3046  __asm__ volatile (
3047  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3048  "li %[tmp0], 0x03 \n\t"
3049  "mtc1 %[tmp0], %[ftmp4] \n\t"
3050  "pshufh %[c], %[c], %[ftmp0] \n\t"
3051  "pshufh %[d], %[d], %[ftmp0] \n\t"
3052 
3053  "1: \n\t"
3054  // 0 - 7
3055  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3056  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
3057  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
3058  // 8 - 15
3059  PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
3060 
3061  "addiu %[h], %[h], -0x01 \n\t"
3062  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3063  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3064  "bnez %[h], 1b \n\t"
3065  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3066  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3067  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3068  [ftmp6]"=&f"(ftmp[6]),
3069  [tmp0]"=&r"(tmp[0]),
3070  RESTRICT_ASM_ALL64
3071  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
3072  [src1]"=&r"(src1),
3073  [h]"+&r"(h),
3074  [dst]"+&r"(dst), [src]"+&r"(src),
3075  [c]"+&f"(c.f), [d]"+&f"(d.f)
3076  : [sstride]"r"((mips_reg)sstride),
3077  [dstride]"r"((mips_reg)dstride),
3078  [ff_pw_4]"f"(ff_pw_4.f)
3079  : "memory"
3080  );
3081 #else
3082  int c = 8 - my, d = my;
3083  int x, y;
3084 
3085  for (y = 0; y < h; y++) {
3086  for (x = 0; x < 16; x++)
3087  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3088  dst += dstride;
3089  src += sstride;
3090  }
3091 #endif
3092 }
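/* As the commented reference expressions show, the vertical filter blends the
 * same column of two adjacent rows, src[x] and src[x + sstride].  The extra
 * src1 operand is presumably scratch for the second-row address inside
 * PUT_VP8_BILINEAR8_V_MMI (assumption; the macro body is defined earlier in
 * this file). */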
3093 
3094 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3095  ptrdiff_t sstride, int h, int mx, int my)
3096 {
3097 #if 1
3098  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
3099  uint8_t *tmp = tmp_array;
3100 
3101  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
3102  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
3103 #else
3104  int a = 8 - mx, b = mx;
3105  int c = 8 - my, d = my;
3106  int x, y;
3107  uint8_t tmp_array[528];
3108  uint8_t *tmp = tmp_array;
3109 
3110  for (y = 0; y < h + 1; y++) {
3111  for (x = 0; x < 16; x++)
3112  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3113  tmp += 16;
3114  src += sstride;
3115  }
3116 
3117  tmp = tmp_array;
3118 
3119  for (y = 0; y < h; y++) {
3120  for (x = 0; x < 16; x++)
3121  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3122  dst += dstride;
3123  tmp += 16;
3124  }
3125 #endif
3126 }
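/* The hv filter is separable: the horizontal pass writes h + 1 rows into a
 * 16-byte-stride scratch buffer (one extra row because the vertical pass
 * reads tmp[x] and tmp[x + 16]), then the vertical pass blends consecutive
 * scratch rows into dst.  The 528-byte buffer is 33 * 16, i.e. room for
 * h + 1 rows with h up to 32; the 8- and 4-pixel variants below size theirs
 * the same way (136 = 17 * 8, 36 = 9 * 4). */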
3127 
3128 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3129  ptrdiff_t sstride, int h, int mx, int my)
3130 {
3131 #if 1
3132  union mmi_intfloat64 a, b;
3133  double ftmp[7];
3134  uint32_t tmp[1];
3135  DECLARE_VAR_ALL64;
3136  a.i = 8 - mx;
3137  b.i = mx;
3138 
3139  /*
3140  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3141  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3142  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3143  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3144  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3145  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3146  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3147  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3148  */
3149  __asm__ volatile (
3150  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3151  "li %[tmp0], 0x03 \n\t"
3152  "mtc1 %[tmp0], %[ftmp4] \n\t"
3153  "pshufh %[a], %[a], %[ftmp0] \n\t"
3154  "pshufh %[b], %[b], %[ftmp0] \n\t"
3155 
3156  "1: \n\t"
3157  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3158 
3159  "addiu %[h], %[h], -0x01 \n\t"
3160  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3161  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3162  "bnez %[h], 1b \n\t"
3163  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3164  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3165  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3166  [ftmp6]"=&f"(ftmp[6]),
3167  [tmp0]"=&r"(tmp[0]),
3168  RESTRICT_ASM_ALL64
3169  [h]"+&r"(h),
3170  [dst]"+&r"(dst), [src]"+&r"(src),
3171  [a]"+&f"(a.f), [b]"+&f"(b.f)
3172  : [sstride]"r"((mips_reg)sstride),
3173  [dstride]"r"((mips_reg)dstride),
3174  [ff_pw_4]"f"(ff_pw_4.f)
3175  : "memory"
3176  );
3177 #else
3178  int a = 8 - mx, b = mx;
3179  int x, y;
3180 
3181  for (y = 0; y < h; y++) {
3182  for (x = 0; x < 8; x++)
3183  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3184  dst += dstride;
3185  src += sstride;
3186  }
3187 #endif
3188 }
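/* Same structure as bilinear16_h, but a single 8-wide macro invocation covers
 * the whole row, so the "+ 8" src0/dst0 offset pointers of the 16-pixel
 * version are not needed. */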
3189 
3190 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3191  ptrdiff_t sstride, int h, int mx, int my)
3192 {
3193 #if 1
3194  union mmi_intfloat64 c, d;
3195  double ftmp[7];
3196  uint32_t tmp[1];
3197  mips_reg src1;
3198  DECLARE_VAR_ALL64;
3199  c.i = 8 - my;
3200  d.i = my;
3201 
3202  /*
3203  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3204  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3205  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3206  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3207  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3208  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3209  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3210  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3211  */
3212  __asm__ volatile (
3213  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3214  "li %[tmp0], 0x03 \n\t"
3215  "mtc1 %[tmp0], %[ftmp4] \n\t"
3216  "pshufh %[c], %[c], %[ftmp0] \n\t"
3217  "pshufh %[d], %[d], %[ftmp0] \n\t"
3218 
3219  "1: \n\t"
3220  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3221 
3222  "addiu %[h], %[h], -0x01 \n\t"
3223  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3224  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3225  "bnez %[h], 1b \n\t"
3226  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3227  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3228  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3229  [ftmp6]"=&f"(ftmp[6]),
3230  [tmp0]"=&r"(tmp[0]),
3231  RESTRICT_ASM_ALL64
3232  [src1]"=&r"(src1),
3233  [h]"+&r"(h),
3234  [dst]"+&r"(dst), [src]"+&r"(src),
3235  [c]"+&f"(c.f), [d]"+&f"(d.f)
3236  : [sstride]"r"((mips_reg)sstride),
3237  [dstride]"r"((mips_reg)dstride),
3238  [ff_pw_4]"f"(ff_pw_4.f)
3239  : "memory"
3240  );
3241 #else
3242  int c = 8 - my, d = my;
3243  int x, y;
3244 
3245  for (y = 0; y < h; y++) {
3246  for (x = 0; x < 8; x++)
3247  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3248  dst += dstride;
3249  src += sstride;
3250  }
3251 #endif
3252 }
3253 
3254 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3255  ptrdiff_t sstride, int h, int mx, int my)
3256 {
3257 #if 1
3258  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3259  uint8_t *tmp = tmp_array;
3260 
3261  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3262  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3263 #else
3264  int a = 8 - mx, b = mx;
3265  int c = 8 - my, d = my;
3266  int x, y;
3267  uint8_t tmp_array[136];
3268  uint8_t *tmp = tmp_array;
3269 
3270  for (y = 0; y < h + 1; y++) {
3271  for (x = 0; x < 8; x++)
3272  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3273  tmp += 8;
3274  src += sstride;
3275  }
3276 
3277  tmp = tmp_array;
3278 
3279  for (y = 0; y < h; y++) {
3280  for (x = 0; x < 8; x++)
3281  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3282  dst += dstride;
3283  tmp += 8;
3284  }
3285 #endif
3286 }
3287 
3288 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3289  ptrdiff_t sstride, int h, int mx, int my)
3290 {
3291 #if 1
3292  union mmi_intfloat64 a, b;
3293  double ftmp[5];
3294  uint32_t tmp[1];
3295  DECLARE_VAR_LOW32;
3296  DECLARE_VAR_ALL64;
3297  a.i = 8 - mx;
3298  b.i = mx;
3299 
3300  /*
3301  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3302  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3303  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3304  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3305  */
3306  __asm__ volatile (
3307  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3308  "li %[tmp0], 0x03 \n\t"
3309  "mtc1 %[tmp0], %[ftmp4] \n\t"
3310  "pshufh %[a], %[a], %[ftmp0] \n\t"
3311  "pshufh %[b], %[b], %[ftmp0] \n\t"
3312 
3313  "1: \n\t"
3314  PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3315 
3316  "addiu %[h], %[h], -0x01 \n\t"
3317  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3318  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3319  "bnez %[h], 1b \n\t"
3320  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3321  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3322  [ftmp4]"=&f"(ftmp[4]),
3323  [tmp0]"=&r"(tmp[0]),
3324  RESTRICT_ASM_LOW32
3325  RESTRICT_ASM_ALL64
3326  [h]"+&r"(h),
3327  [dst]"+&r"(dst), [src]"+&r"(src),
3328  [a]"+&f"(a.f), [b]"+&f"(b.f)
3329  : [sstride]"r"((mips_reg)sstride),
3330  [dstride]"r"((mips_reg)dstride),
3331  [ff_pw_4]"f"(ff_pw_4.f)
3332  : "memory"
3333  );
3334 #else
3335  int a = 8 - mx, b = mx;
3336  int x, y;
3337 
3338  for (y = 0; y < h; y++) {
3339  for (x = 0; x < 4; x++)
3340  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3341  dst += dstride;
3342  src += sstride;
3343  }
3344 #endif
3345 }
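/* The 4-pixel variants additionally declare the LOW32 constraint macros,
 * which suggests 32-bit (4 bytes per row) loads/stores inside
 * PUT_VP8_BILINEAR4_H_MMI (assumption; see the macro definitions earlier in
 * this file and in mmiutils.h). */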
3346 
3347 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3348  ptrdiff_t sstride, int h, int mx, int my)
3349 {
3350 #if 1
3351  union mmi_intfloat64 c, d;
3352  double ftmp[7];
3353  uint32_t tmp[1];
3354  mips_reg src1;
3355  DECLARE_VAR_LOW32;
3356  DECLARE_VAR_ALL64;
3357  c.i = 8 - my;
3358  d.i = my;
3359 
3360  /*
3361  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3362  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3363  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3364  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3365  */
3366  __asm__ volatile (
3367  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3368  "li %[tmp0], 0x03 \n\t"
3369  "mtc1 %[tmp0], %[ftmp4] \n\t"
3370  "pshufh %[c], %[c], %[ftmp0] \n\t"
3371  "pshufh %[d], %[d], %[ftmp0] \n\t"
3372 
3373  "1: \n\t"
3374  PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3375 
3376  "addiu %[h], %[h], -0x01 \n\t"
3377  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3378  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3379  "bnez %[h], 1b \n\t"
3380  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3381  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3382  [ftmp4]"=&f"(ftmp[4]),
3383  [tmp0]"=&r"(tmp[0]),
3384  RESTRICT_ASM_LOW32
3385  RESTRICT_ASM_ALL64
3386  [src1]"=&r"(src1),
3387  [h]"+&r"(h),
3388  [dst]"+&r"(dst), [src]"+&r"(src),
3389  [c]"+&f"(c.f), [d]"+&f"(d.f)
3390  : [sstride]"r"((mips_reg)sstride),
3391  [dstride]"r"((mips_reg)dstride),
3392  [ff_pw_4]"f"(ff_pw_4.f)
3393  : "memory"
3394  );
3395 #else
3396  int c = 8 - my, d = my;
3397  int x, y;
3398 
3399  for (y = 0; y < h; y++) {
3400  for (x = 0; x < 4; x++)
3401  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3402  dst += dstride;
3403  src += sstride;
3404  }
3405 #endif
3406 }
3407 
3408 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3409  ptrdiff_t sstride, int h, int mx, int my)
3410 {
3411 #if 1
3412  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3413  uint8_t *tmp = tmp_array;
3414 
3415  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3416  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3417 #else
3418  int a = 8 - mx, b = mx;
3419  int c = 8 - my, d = my;
3420  int x, y;
3421  uint8_t tmp_array[36];
3422  uint8_t *tmp = tmp_array;
3423 
3424  for (y = 0; y < h + 1; y++) {
3425  for (x = 0; x < 4; x++)
3426  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3427  tmp += 4;
3428  src += sstride;
3429  }
3430 
3431  tmp = tmp_array;
3432 
3433  for (y = 0; y < h; y++) {
3434  for (x = 0; x < 4; x++)
3435  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3436  dst += dstride;
3437  tmp += 4;
3438  }
3439 #endif
3440 }
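/* Usage sketch (illustrative, not part of this file): callers hold 1/8-pel
 * offsets mx, my in 0..7 and a block height h.  In FFmpeg the dispatch is
 * done through the VP8 DSP function table at init time, but the selection it
 * encodes amounts to:
 *
 *     if (mx && my)
 *         ff_put_vp8_bilinear16_hv_mmi(dst, dstride, src, sstride, h, mx, my);
 *     else if (mx)
 *         ff_put_vp8_bilinear16_h_mmi (dst, dstride, src, sstride, h, mx, my);
 *     else if (my)
 *         ff_put_vp8_bilinear16_v_mmi (dst, dstride, src, sstride, h, mx, my);
 *     else
 *         ff_put_vp8_pixels16_mmi     (dst, dstride, src, sstride, h, mx, my);
 */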