FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
h264qpel_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mem.h"
22 
23 #ifdef DEBUG
24 #define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
25 #else
26 #define ASSERT_ALIGNED(ptr) ;
27 #endif
28 
29 /* this code assume stride % 16 == 0 */
30 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
31 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
32  const uint8_t *src,
33  int dstStride, int srcStride)
34 {
35  register int i;
36 
37  LOAD_ZERO;
38  const vec_u8 permM2 = vec_lvsl(-2, src);
39  const vec_u8 permM1 = vec_lvsl(-1, src);
40  const vec_u8 permP0 = vec_lvsl(+0, src);
41  const vec_u8 permP1 = vec_lvsl(+1, src);
42  const vec_u8 permP2 = vec_lvsl(+2, src);
43  const vec_u8 permP3 = vec_lvsl(+3, src);
44  const vec_s16 v5ss = vec_splat_s16(5);
45  const vec_u16 v5us = vec_splat_u16(5);
46  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
47  const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
48 
49  vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
50 
51  register int align = ((((unsigned long)src) - 2) % 16);
52 
53  vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
54  srcP2A, srcP2B, srcP3A, srcP3B,
55  srcM1A, srcM1B, srcM2A, srcM2B,
56  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
57  pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
58  psumA, psumB, sumA, sumB;
59 
60  vec_u8 sum, fsum;
61 
62  for (i = 0 ; i < 16 ; i ++) {
63  vec_u8 srcR1 = vec_ld(-2, src);
64  vec_u8 srcR2 = vec_ld(14, src);
65 
66  switch (align) {
67  default: {
68  srcM2 = vec_perm(srcR1, srcR2, permM2);
69  srcM1 = vec_perm(srcR1, srcR2, permM1);
70  srcP0 = vec_perm(srcR1, srcR2, permP0);
71  srcP1 = vec_perm(srcR1, srcR2, permP1);
72  srcP2 = vec_perm(srcR1, srcR2, permP2);
73  srcP3 = vec_perm(srcR1, srcR2, permP3);
74  } break;
75  case 11: {
76  srcM2 = vec_perm(srcR1, srcR2, permM2);
77  srcM1 = vec_perm(srcR1, srcR2, permM1);
78  srcP0 = vec_perm(srcR1, srcR2, permP0);
79  srcP1 = vec_perm(srcR1, srcR2, permP1);
80  srcP2 = vec_perm(srcR1, srcR2, permP2);
81  srcP3 = srcR2;
82  } break;
83  case 12: {
84  vec_u8 srcR3 = vec_ld(30, src);
85  srcM2 = vec_perm(srcR1, srcR2, permM2);
86  srcM1 = vec_perm(srcR1, srcR2, permM1);
87  srcP0 = vec_perm(srcR1, srcR2, permP0);
88  srcP1 = vec_perm(srcR1, srcR2, permP1);
89  srcP2 = srcR2;
90  srcP3 = vec_perm(srcR2, srcR3, permP3);
91  } break;
92  case 13: {
93  vec_u8 srcR3 = vec_ld(30, src);
94  srcM2 = vec_perm(srcR1, srcR2, permM2);
95  srcM1 = vec_perm(srcR1, srcR2, permM1);
96  srcP0 = vec_perm(srcR1, srcR2, permP0);
97  srcP1 = srcR2;
98  srcP2 = vec_perm(srcR2, srcR3, permP2);
99  srcP3 = vec_perm(srcR2, srcR3, permP3);
100  } break;
101  case 14: {
102  vec_u8 srcR3 = vec_ld(30, src);
103  srcM2 = vec_perm(srcR1, srcR2, permM2);
104  srcM1 = vec_perm(srcR1, srcR2, permM1);
105  srcP0 = srcR2;
106  srcP1 = vec_perm(srcR2, srcR3, permP1);
107  srcP2 = vec_perm(srcR2, srcR3, permP2);
108  srcP3 = vec_perm(srcR2, srcR3, permP3);
109  } break;
110  case 15: {
111  vec_u8 srcR3 = vec_ld(30, src);
112  srcM2 = vec_perm(srcR1, srcR2, permM2);
113  srcM1 = srcR2;
114  srcP0 = vec_perm(srcR2, srcR3, permP0);
115  srcP1 = vec_perm(srcR2, srcR3, permP1);
116  srcP2 = vec_perm(srcR2, srcR3, permP2);
117  srcP3 = vec_perm(srcR2, srcR3, permP3);
118  } break;
119  }
120 
121  srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
122  srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
123  srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
124  srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
125 
126  srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
127  srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
128  srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
129  srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
130 
131  srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
132  srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
133  srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
134  srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
135 
136  sum1A = vec_adds(srcP0A, srcP1A);
137  sum1B = vec_adds(srcP0B, srcP1B);
138  sum2A = vec_adds(srcM1A, srcP2A);
139  sum2B = vec_adds(srcM1B, srcP2B);
140  sum3A = vec_adds(srcM2A, srcP3A);
141  sum3B = vec_adds(srcM2B, srcP3B);
142 
143  pp1A = vec_mladd(sum1A, v20ss, v16ss);
144  pp1B = vec_mladd(sum1B, v20ss, v16ss);
145 
146  pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
147  pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
148 
149  pp3A = vec_add(sum3A, pp1A);
150  pp3B = vec_add(sum3B, pp1B);
151 
152  psumA = vec_sub(pp3A, pp2A);
153  psumB = vec_sub(pp3B, pp2B);
154 
155  sumA = vec_sra(psumA, v5us);
156  sumB = vec_sra(psumB, v5us);
157 
158  sum = vec_packsu(sumA, sumB);
159 
160  ASSERT_ALIGNED(dst);
161 
162  OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
163 
164  vec_st(fsum, 0, dst);
165 
166  src += srcStride;
167  dst += dstStride;
168  }
169 }
170 #endif
171 
172 /* this code assume stride % 16 == 0 */
173 #ifdef PREFIX_h264_qpel16_v_lowpass_altivec
174 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
175  const uint8_t *src,
176  int dstStride, int srcStride)
177 {
178  register int i;
179 
180  LOAD_ZERO;
181  const vec_u8 perm = vec_lvsl(0, src);
182  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
183  const vec_u16 v5us = vec_splat_u16(5);
184  const vec_s16 v5ss = vec_splat_s16(5);
185  const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
186 
187  const uint8_t *srcbis = src - (srcStride * 2);
188 
189  const vec_u8 srcM2a = vec_ld(0, srcbis);
190  const vec_u8 srcM2b = vec_ld(16, srcbis);
191  const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
192  //srcbis += srcStride;
193  const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
194  const vec_u8 srcM1b = vec_ld(16, srcbis);
195  const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
196  //srcbis += srcStride;
197  const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
198  const vec_u8 srcP0b = vec_ld(16, srcbis);
199  const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
200  //srcbis += srcStride;
201  const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
202  const vec_u8 srcP1b = vec_ld(16, srcbis);
203  const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
204  //srcbis += srcStride;
205  const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
206  const vec_u8 srcP2b = vec_ld(16, srcbis);
207  const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
208  //srcbis += srcStride;
209 
210  vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
211  vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
212  vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
213  vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
214  vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
215  vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
216  vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
217  vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
218  vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
219  vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
220 
221  vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
222  psumA, psumB, sumA, sumB,
223  srcP3ssA, srcP3ssB,
224  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
225 
226  vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
227 
228  for (i = 0 ; i < 16 ; i++) {
229  srcP3a = vec_ld(0, srcbis += srcStride);
230  srcP3b = vec_ld(16, srcbis);
231  srcP3 = vec_perm(srcP3a, srcP3b, perm);
232  srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
233  srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
234  //srcbis += srcStride;
235 
236  sum1A = vec_adds(srcP0ssA, srcP1ssA);
237  sum1B = vec_adds(srcP0ssB, srcP1ssB);
238  sum2A = vec_adds(srcM1ssA, srcP2ssA);
239  sum2B = vec_adds(srcM1ssB, srcP2ssB);
240  sum3A = vec_adds(srcM2ssA, srcP3ssA);
241  sum3B = vec_adds(srcM2ssB, srcP3ssB);
242 
243  srcM2ssA = srcM1ssA;
244  srcM2ssB = srcM1ssB;
245  srcM1ssA = srcP0ssA;
246  srcM1ssB = srcP0ssB;
247  srcP0ssA = srcP1ssA;
248  srcP0ssB = srcP1ssB;
249  srcP1ssA = srcP2ssA;
250  srcP1ssB = srcP2ssB;
251  srcP2ssA = srcP3ssA;
252  srcP2ssB = srcP3ssB;
253 
254  pp1A = vec_mladd(sum1A, v20ss, v16ss);
255  pp1B = vec_mladd(sum1B, v20ss, v16ss);
256 
257  pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
258  pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
259 
260  pp3A = vec_add(sum3A, pp1A);
261  pp3B = vec_add(sum3B, pp1B);
262 
263  psumA = vec_sub(pp3A, pp2A);
264  psumB = vec_sub(pp3B, pp2B);
265 
266  sumA = vec_sra(psumA, v5us);
267  sumB = vec_sra(psumB, v5us);
268 
269  sum = vec_packsu(sumA, sumB);
270 
271  ASSERT_ALIGNED(dst);
272 
273  OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
274 
275  vec_st(fsum, 0, dst);
276 
277  dst += dstStride;
278  }
279 }
280 #endif
281 
282 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
283 #ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
284 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
285  const uint8_t *src,
286  int dstStride, int tmpStride,
287  int srcStride)
288 {
289  register int i;
290  LOAD_ZERO;
291  const vec_u8 permM2 = vec_lvsl(-2, src);
292  const vec_u8 permM1 = vec_lvsl(-1, src);
293  const vec_u8 permP0 = vec_lvsl(+0, src);
294  const vec_u8 permP1 = vec_lvsl(+1, src);
295  const vec_u8 permP2 = vec_lvsl(+2, src);
296  const vec_u8 permP3 = vec_lvsl(+3, src);
297  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
298  const vec_u32 v10ui = vec_splat_u32(10);
299  const vec_s16 v5ss = vec_splat_s16(5);
300  const vec_s16 v1ss = vec_splat_s16(1);
301  const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
302  const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
303 
304  register int align = ((((unsigned long)src) - 2) % 16);
305 
306  vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
307  srcP2A, srcP2B, srcP3A, srcP3B,
308  srcM1A, srcM1B, srcM2A, srcM2B,
309  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
310  pp1A, pp1B, pp2A, pp2B, psumA, psumB;
311 
312  const vec_u8 mperm = (const vec_u8)
313  {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
314  0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
315  int16_t *tmpbis = tmp;
316 
317  vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
318  tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
319  tmpP2ssA, tmpP2ssB;
320 
321  vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
322  pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
323  pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
324  ssumAe, ssumAo, ssumBe, ssumBo;
325  vec_u8 fsum, sumv, sum;
326  vec_s16 ssume, ssumo;
327 
328  src -= (2 * srcStride);
329  for (i = 0 ; i < 21 ; i ++) {
330  vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
331  vec_u8 srcR1 = vec_ld(-2, src);
332  vec_u8 srcR2 = vec_ld(14, src);
333 
334  switch (align) {
335  default: {
336  srcM2 = vec_perm(srcR1, srcR2, permM2);
337  srcM1 = vec_perm(srcR1, srcR2, permM1);
338  srcP0 = vec_perm(srcR1, srcR2, permP0);
339  srcP1 = vec_perm(srcR1, srcR2, permP1);
340  srcP2 = vec_perm(srcR1, srcR2, permP2);
341  srcP3 = vec_perm(srcR1, srcR2, permP3);
342  } break;
343  case 11: {
344  srcM2 = vec_perm(srcR1, srcR2, permM2);
345  srcM1 = vec_perm(srcR1, srcR2, permM1);
346  srcP0 = vec_perm(srcR1, srcR2, permP0);
347  srcP1 = vec_perm(srcR1, srcR2, permP1);
348  srcP2 = vec_perm(srcR1, srcR2, permP2);
349  srcP3 = srcR2;
350  } break;
351  case 12: {
352  vec_u8 srcR3 = vec_ld(30, src);
353  srcM2 = vec_perm(srcR1, srcR2, permM2);
354  srcM1 = vec_perm(srcR1, srcR2, permM1);
355  srcP0 = vec_perm(srcR1, srcR2, permP0);
356  srcP1 = vec_perm(srcR1, srcR2, permP1);
357  srcP2 = srcR2;
358  srcP3 = vec_perm(srcR2, srcR3, permP3);
359  } break;
360  case 13: {
361  vec_u8 srcR3 = vec_ld(30, src);
362  srcM2 = vec_perm(srcR1, srcR2, permM2);
363  srcM1 = vec_perm(srcR1, srcR2, permM1);
364  srcP0 = vec_perm(srcR1, srcR2, permP0);
365  srcP1 = srcR2;
366  srcP2 = vec_perm(srcR2, srcR3, permP2);
367  srcP3 = vec_perm(srcR2, srcR3, permP3);
368  } break;
369  case 14: {
370  vec_u8 srcR3 = vec_ld(30, src);
371  srcM2 = vec_perm(srcR1, srcR2, permM2);
372  srcM1 = vec_perm(srcR1, srcR2, permM1);
373  srcP0 = srcR2;
374  srcP1 = vec_perm(srcR2, srcR3, permP1);
375  srcP2 = vec_perm(srcR2, srcR3, permP2);
376  srcP3 = vec_perm(srcR2, srcR3, permP3);
377  } break;
378  case 15: {
379  vec_u8 srcR3 = vec_ld(30, src);
380  srcM2 = vec_perm(srcR1, srcR2, permM2);
381  srcM1 = srcR2;
382  srcP0 = vec_perm(srcR2, srcR3, permP0);
383  srcP1 = vec_perm(srcR2, srcR3, permP1);
384  srcP2 = vec_perm(srcR2, srcR3, permP2);
385  srcP3 = vec_perm(srcR2, srcR3, permP3);
386  } break;
387  }
388 
389  srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
390  srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
391  srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
392  srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
393 
394  srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
395  srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
396  srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
397  srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
398 
399  srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
400  srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
401  srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
402  srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
403 
404  sum1A = vec_adds(srcP0A, srcP1A);
405  sum1B = vec_adds(srcP0B, srcP1B);
406  sum2A = vec_adds(srcM1A, srcP2A);
407  sum2B = vec_adds(srcM1B, srcP2B);
408  sum3A = vec_adds(srcM2A, srcP3A);
409  sum3B = vec_adds(srcM2B, srcP3B);
410 
411  pp1A = vec_mladd(sum1A, v20ss, sum3A);
412  pp1B = vec_mladd(sum1B, v20ss, sum3B);
413 
414  pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
415  pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
416 
417  psumA = vec_sub(pp1A, pp2A);
418  psumB = vec_sub(pp1B, pp2B);
419 
420  vec_st(psumA, 0, tmp);
421  vec_st(psumB, 16, tmp);
422 
423  src += srcStride;
424  tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
425  }
426 
427  tmpM2ssA = vec_ld(0, tmpbis);
428  tmpM2ssB = vec_ld(16, tmpbis);
429  tmpbis += tmpStride;
430  tmpM1ssA = vec_ld(0, tmpbis);
431  tmpM1ssB = vec_ld(16, tmpbis);
432  tmpbis += tmpStride;
433  tmpP0ssA = vec_ld(0, tmpbis);
434  tmpP0ssB = vec_ld(16, tmpbis);
435  tmpbis += tmpStride;
436  tmpP1ssA = vec_ld(0, tmpbis);
437  tmpP1ssB = vec_ld(16, tmpbis);
438  tmpbis += tmpStride;
439  tmpP2ssA = vec_ld(0, tmpbis);
440  tmpP2ssB = vec_ld(16, tmpbis);
441  tmpbis += tmpStride;
442 
443  for (i = 0 ; i < 16 ; i++) {
444  const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
445  const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
446 
447  const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
448  const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
449  const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
450  const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
451  const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
452  const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
453 
454  tmpbis += tmpStride;
455 
456  tmpM2ssA = tmpM1ssA;
457  tmpM2ssB = tmpM1ssB;
458  tmpM1ssA = tmpP0ssA;
459  tmpM1ssB = tmpP0ssB;
460  tmpP0ssA = tmpP1ssA;
461  tmpP0ssB = tmpP1ssB;
462  tmpP1ssA = tmpP2ssA;
463  tmpP1ssB = tmpP2ssB;
464  tmpP2ssA = tmpP3ssA;
465  tmpP2ssB = tmpP3ssB;
466 
467  pp1Ae = vec_mule(sum1A, v20ss);
468  pp1Ao = vec_mulo(sum1A, v20ss);
469  pp1Be = vec_mule(sum1B, v20ss);
470  pp1Bo = vec_mulo(sum1B, v20ss);
471 
472  pp2Ae = vec_mule(sum2A, v5ss);
473  pp2Ao = vec_mulo(sum2A, v5ss);
474  pp2Be = vec_mule(sum2B, v5ss);
475  pp2Bo = vec_mulo(sum2B, v5ss);
476 
477  pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
478  pp3Ao = vec_mulo(sum3A, v1ss);
479  pp3Be = vec_sra((vec_s32)sum3B, v16ui);
480  pp3Bo = vec_mulo(sum3B, v1ss);
481 
482  pp1cAe = vec_add(pp1Ae, v512si);
483  pp1cAo = vec_add(pp1Ao, v512si);
484  pp1cBe = vec_add(pp1Be, v512si);
485  pp1cBo = vec_add(pp1Bo, v512si);
486 
487  pp32Ae = vec_sub(pp3Ae, pp2Ae);
488  pp32Ao = vec_sub(pp3Ao, pp2Ao);
489  pp32Be = vec_sub(pp3Be, pp2Be);
490  pp32Bo = vec_sub(pp3Bo, pp2Bo);
491 
492  sumAe = vec_add(pp1cAe, pp32Ae);
493  sumAo = vec_add(pp1cAo, pp32Ao);
494  sumBe = vec_add(pp1cBe, pp32Be);
495  sumBo = vec_add(pp1cBo, pp32Bo);
496 
497  ssumAe = vec_sra(sumAe, v10ui);
498  ssumAo = vec_sra(sumAo, v10ui);
499  ssumBe = vec_sra(sumBe, v10ui);
500  ssumBo = vec_sra(sumBo, v10ui);
501 
502  ssume = vec_packs(ssumAe, ssumBe);
503  ssumo = vec_packs(ssumAo, ssumBo);
504 
505  sumv = vec_packsu(ssume, ssumo);
506  sum = vec_perm(sumv, sumv, mperm);
507 
508  ASSERT_ALIGNED(dst);
509 
510  OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
511 
512  vec_st(fsum, 0, dst);
513 
514  dst += dstStride;
515  }
516 }
517 #endif