FFmpeg
me_cmp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
29 
30 #include "libavcodec/avcodec.h"
31 #include "libavcodec/mpegvideo.h"
32 #include "libavcodec/me_cmp.h"
33 
34 #if HAVE_ALTIVEC
35 
/*
 * Helpers for fetching a row of reference pixels together with the same row
 * shifted by one byte.
 *
 * GET_PERM(per1, per2, pix)
 *     Big-endian: derives two permute vectors from the address `pix`
 *     (vec_lvsl, and the same vector with 1 added to every byte).
 *     Little-endian: expands to an empty statement; per1/per2 are untouched.
 *
 * LOAD_PIX(v, iv, pix, per1, per2)
 *     Stores into `v` the 16 bytes starting at `pix` and into `iv` the
 *     16 bytes starting at `pix + 1`.
 *     Big-endian: two vec_ld loads (offsets 0 and 16) combined with vec_perm
 *     using per1/per2. Little-endian: two vec_vsx_ld loads (offsets 0 and 1).
 *
 * The macros are wrapped in do { ... } while (0) so that `MACRO(...);` is a
 * single statement and stays correct under an un-braced if/else. The
 * temporaries carry a trailing underscore so they cannot capture a caller
 * variable passed as an argument.
 */
#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) do {                          \
    per1 = vec_lvsl(0, pix);                                    \
    per2 = vec_add(per1, vec_splat_u8(1));                      \
} while (0)
#define LOAD_PIX(v, iv, pix, per1, per2) do {                   \
    vector unsigned char load_pix_lo_ = vec_ld(0,  pix);        \
    vector unsigned char load_pix_hi_ = vec_ld(16, pix);        \
    v  = vec_perm(load_pix_lo_, load_pix_hi_, per1);            \
    iv = vec_perm(load_pix_lo_, load_pix_hi_, per2);            \
} while (0)
#else
#define GET_PERM(per1, per2, pix) do { } while (0)
#define LOAD_PIX(v, iv, pix, per1, per2) do {                   \
    v  = vec_vsx_ld(0, pix);                                    \
    iv = vec_vsx_ld(1, pix);                                    \
} while (0)
#endif
54 static int sad16_x2_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
55  ptrdiff_t stride, int h)
56 {
57  int i;
58  int __attribute__((aligned(16))) s = 0;
59  const vector unsigned char zero =
60  (const vector unsigned char) vec_splat_u8(0);
61  vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
62  vector signed int sumdiffs;
63  vector unsigned char perm1, perm2, pix2v, pix2iv;
64 
65  GET_PERM(perm1, perm2, pix2);
66  for (i = 0; i < h; i++) {
67  /* Read unaligned pixels into our vectors. The vectors are as follows:
68  * pix1v: pix1[0] - pix1[15]
69  * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */
70  vector unsigned char pix1v = vec_ld(0, pix1);
71  LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
72 
73  /* Calculate the average vector. */
74  vector unsigned char avgv = vec_avg(pix2v, pix2iv);
75 
76  /* Calculate a sum of abs differences vector. */
77  vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
78  vec_min(pix1v, avgv));
79 
80  /* Add each 4 pixel group together and put 4 results into sad. */
81  sad = vec_sum4s(t5, sad);
82 
83  pix1 += stride;
84  pix2 += stride;
85  }
86  /* Sum up the four partial sums, and put the result into s. */
87  sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
88  sumdiffs = vec_splat(sumdiffs, 3);
89  vec_ste(sumdiffs, 0, &s);
90 
91  return s;
92 }
93 
94 static int sad16_y2_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
95  ptrdiff_t stride, int h)
96 {
97  int i;
98  int __attribute__((aligned(16))) s = 0;
99  const vector unsigned char zero =
100  (const vector unsigned char) vec_splat_u8(0);
101  vector unsigned char pix1v, pix3v, avgv, t5;
102  vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
103  vector signed int sumdiffs;
104 
105  const uint8_t *pix3 = pix2 + stride;
106 
107  /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
108  * iteration becomes pix2 in the next iteration. We can use this
109  * fact to avoid a potentially expensive unaligned read, each
110  * time around the loop.
111  * Read unaligned pixels into our vectors. The vectors are as follows:
112  * pix2v: pix2[0] - pix2[15]
113  * Split the pixel vectors into shorts. */
114  vector unsigned char pix2v = VEC_LD(0, pix2);
115 
116  for (i = 0; i < h; i++) {
117  /* Read unaligned pixels into our vectors. The vectors are as follows:
118  * pix1v: pix1[0] - pix1[15]
119  * pix3v: pix3[0] - pix3[15] */
120  pix1v = vec_ld(0, pix1);
121  pix3v = VEC_LD(0, pix3);
122 
123  /* Calculate the average vector. */
124  avgv = vec_avg(pix2v, pix3v);
125 
126  /* Calculate a sum of abs differences vector. */
127  t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
128 
129  /* Add each 4 pixel group together and put 4 results into sad. */
130  sad = vec_sum4s(t5, sad);
131 
132  pix1 += stride;
133  pix2v = pix3v;
134  pix3 += stride;
135  }
136 
137  /* Sum up the four partial sums, and put the result into s. */
138  sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
139  sumdiffs = vec_splat(sumdiffs, 3);
140  vec_ste(sumdiffs, 0, &s);
141  return s;
142 }
143 
144 static int sad16_xy2_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
145  ptrdiff_t stride, int h)
146 {
147  int i;
148  int __attribute__((aligned(16))) s = 0;
149  const uint8_t *pix3 = pix2 + stride;
150  const vector unsigned char zero =
151  (const vector unsigned char) vec_splat_u8(0);
152  const vector unsigned short two =
153  (const vector unsigned short) vec_splat_u16(2);
154  vector unsigned char avgv, t5;
155  vector unsigned char pix1v, pix3v, pix3iv;
156  vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
157  vector unsigned short avghv, avglv;
158  vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
159  vector signed int sumdiffs;
160  vector unsigned char perm1, perm2, pix2v, pix2iv;
161  GET_PERM(perm1, perm2, pix2);
162 
163  /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
164  * iteration becomes pix2 in the next iteration. We can use this
165  * fact to avoid a potentially expensive unaligned read, as well
166  * as some splitting, and vector addition each time around the loop.
167  * Read unaligned pixels into our vectors. The vectors are as follows:
168  * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16]
169  * Split the pixel vectors into shorts. */
170  LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
171  vector unsigned short pix2hv =
172  (vector unsigned short) VEC_MERGEH(zero, pix2v);
173  vector unsigned short pix2lv =
174  (vector unsigned short) VEC_MERGEL(zero, pix2v);
175  vector unsigned short pix2ihv =
176  (vector unsigned short) VEC_MERGEH(zero, pix2iv);
177  vector unsigned short pix2ilv =
178  (vector unsigned short) VEC_MERGEL(zero, pix2iv);
179 
180  vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
181  vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
182  vector unsigned short t3, t4;
183 
184  for (i = 0; i < h; i++) {
185  /* Read unaligned pixels into our vectors. The vectors are as follows:
186  * pix1v: pix1[0] - pix1[15]
187  * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */
188  pix1v = vec_ld(0, pix1);
189  LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);
190 
191  /* Note that AltiVec does have vec_avg, but this works on vector pairs
192  * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
193  * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
194  * it should be 1. Instead, we have to split the pixel vectors into
195  * vectors of shorts and do the averaging by hand. */
196 
197  /* Split the pixel vectors into shorts. */
198  pix3hv = (vector unsigned short) VEC_MERGEH(zero, pix3v);
199  pix3lv = (vector unsigned short) VEC_MERGEL(zero, pix3v);
200  pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
201  pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);
202 
203  /* Do the averaging on them. */
204  t3 = vec_add(pix3hv, pix3ihv);
205  t4 = vec_add(pix3lv, pix3ilv);
206 
207  avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
208  avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
209 
210  /* Pack the shorts back into a result. */
211  avgv = vec_pack(avghv, avglv);
212 
213  /* Calculate a sum of abs differences vector. */
214  t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
215 
216  /* Add each 4 pixel group together and put 4 results into sad. */
217  sad = vec_sum4s(t5, sad);
218 
219  pix1 += stride;
220  pix3 += stride;
221  /* Transfer the calculated values for pix3 into pix2. */
222  t1 = t3;
223  t2 = t4;
224  }
225  /* Sum up the four partial sums, and put the result into s. */
226  sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
227  sumdiffs = vec_splat(sumdiffs, 3);
228  vec_ste(sumdiffs, 0, &s);
229 
230  return s;
231 }
232 
233 static int sad16_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
234  ptrdiff_t stride, int h)
235 {
236  int i;
237  int __attribute__((aligned(16))) s;
238  const vector unsigned int zero =
239  (const vector unsigned int) vec_splat_u32(0);
240  vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
241  vector signed int sumdiffs;
242 
243  for (i = 0; i < h; i++) {
244  /* Read potentially unaligned pixels into t1 and t2. */
245  vector unsigned char t1 =vec_ld(0, pix1);
246  vector unsigned char t2 = VEC_LD(0, pix2);
247 
248  /* Calculate a sum of abs differences vector. */
249  vector unsigned char t3 = vec_max(t1, t2);
250  vector unsigned char t4 = vec_min(t1, t2);
251  vector unsigned char t5 = vec_sub(t3, t4);
252 
253  /* Add each 4 pixel group together and put 4 results into sad. */
254  sad = vec_sum4s(t5, sad);
255 
256  pix1 += stride;
257  pix2 += stride;
258  }
259 
260  /* Sum up the four partial sums, and put the result into s. */
261  sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
262  sumdiffs = vec_splat(sumdiffs, 3);
263  vec_ste(sumdiffs, 0, &s);
264 
265  return s;
266 }
267 
268 static int sad8_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
269  ptrdiff_t stride, int h)
270 {
271  int i;
272  int __attribute__((aligned(16))) s;
273  const vector unsigned int zero =
274  (const vector unsigned int) vec_splat_u32(0);
275  const vector unsigned char permclear =
276  (vector unsigned char)
277  { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
278  vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
279  vector signed int sumdiffs;
280 
281  for (i = 0; i < h; i++) {
282  /* Read potentially unaligned pixels into t1 and t2.
283  * Since we're reading 16 pixels, and actually only want 8,
284  * mask out the last 8 pixels. The 0s don't change the sum. */
285  vector unsigned char pix1l = VEC_LD(0, pix1);
286  vector unsigned char pix2l = VEC_LD(0, pix2);
287  vector unsigned char t1 = vec_and(pix1l, permclear);
288  vector unsigned char t2 = vec_and(pix2l, permclear);
289 
290  /* Calculate a sum of abs differences vector. */
291  vector unsigned char t3 = vec_max(t1, t2);
292  vector unsigned char t4 = vec_min(t1, t2);
293  vector unsigned char t5 = vec_sub(t3, t4);
294 
295  /* Add each 4 pixel group together and put 4 results into sad. */
296  sad = vec_sum4s(t5, sad);
297 
298  pix1 += stride;
299  pix2 += stride;
300  }
301 
302  /* Sum up the four partial sums, and put the result into s. */
303  sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
304  sumdiffs = vec_splat(sumdiffs, 3);
305  vec_ste(sumdiffs, 0, &s);
306 
307  return s;
308 }
309 
310 /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
311  * It's the sad8_altivec code above w/ squaring added. */
312 static int sse8_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
313  ptrdiff_t stride, int h)
314 {
315  int i;
316  int __attribute__((aligned(16))) s;
317  const vector unsigned int zero =
318  (const vector unsigned int) vec_splat_u32(0);
319  const vector unsigned char permclear =
320  (vector unsigned char)
321  { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
322  vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
323  vector signed int sumsqr;
324 
325  for (i = 0; i < h; i++) {
326  /* Read potentially unaligned pixels into t1 and t2.
327  * Since we're reading 16 pixels, and actually only want 8,
328  * mask out the last 8 pixels. The 0s don't change the sum. */
329  vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
330  vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);
331 
332  /* Since we want to use unsigned chars, we can take advantage
333  * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
334 
335  /* Calculate abs differences vector. */
336  vector unsigned char t3 = vec_max(t1, t2);
337  vector unsigned char t4 = vec_min(t1, t2);
338  vector unsigned char t5 = vec_sub(t3, t4);
339 
340  /* Square the values and add them to our sum. */
341  sum = vec_msum(t5, t5, sum);
342 
343  pix1 += stride;
344  pix2 += stride;
345  }
346 
347  /* Sum up the four partial sums, and put the result into s. */
348  sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
349  sumsqr = vec_splat(sumsqr, 3);
350  vec_ste(sumsqr, 0, &s);
351 
352  return s;
353 }
354 
355 /* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
356  * It's the sad16_altivec code above w/ squaring added. */
357 static int sse16_altivec(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
358  ptrdiff_t stride, int h)
359 {
360  int i;
361  int __attribute__((aligned(16))) s;
362  const vector unsigned int zero =
363  (const vector unsigned int) vec_splat_u32(0);
364  vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
365  vector signed int sumsqr;
366 
367  for (i = 0; i < h; i++) {
368  /* Read potentially unaligned pixels into t1 and t2. */
369  vector unsigned char t1 = vec_ld(0, pix1);
370  vector unsigned char t2 = VEC_LD(0, pix2);
371 
372  /* Since we want to use unsigned chars, we can take advantage
373  * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
374 
375  /* Calculate abs differences vector. */
376  vector unsigned char t3 = vec_max(t1, t2);
377  vector unsigned char t4 = vec_min(t1, t2);
378  vector unsigned char t5 = vec_sub(t3, t4);
379 
380  /* Square the values and add them to our sum. */
381  sum = vec_msum(t5, t5, sum);
382 
383  pix1 += stride;
384  pix2 += stride;
385  }
386 
387  /* Sum up the four partial sums, and put the result into s. */
388  sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
389  sumsqr = vec_splat(sumsqr, 3);
390 
391  vec_ste(sumsqr, 0, &s);
392  return s;
393 }
394 
/**
 * Score the difference between two 8x8 pixel blocks after a butterfly
 * (Hadamard-style, going by the function name) transform, AltiVec version.
 *
 * For each of the 8 rows (byte offsets stride * 0 .. stride * 7) a vector is
 * loaded from src and from dst, 8 pixels of each are widened to signed
 * 16-bit values and subtracted (src - dst). Three butterfly stages are
 * applied inside every row, three add/sub stages are then applied across the
 * rows, and the absolute values of all results are summed.
 *
 * @param s      unused in this function
 * @param dst    first pixel block
 * @param src    second pixel block
 * @param stride byte distance between consecutive rows of both blocks
 * @param h      unused in this function; 8 rows are always processed
 * @return the sum of the absolute values of the transformed differences
 */
395 static int hadamard8_diff8x8_altivec(MpegEncContext *s, const uint8_t *dst,
396  const uint8_t *src, ptrdiff_t stride, int h)
397 {
398  int __attribute__((aligned(16))) sum;
399  register const vector unsigned char vzero =
400  (const vector unsigned char) vec_splat_u8(0);
401  register vector signed short temp0, temp1, temp2, temp3, temp4,
402  temp5, temp6, temp7;
/* Pass 1: per-row butterflies. Row i of the result ends up in temp<i>. */
403  {
/* Sign patterns for the three butterfly stages; used as the multiplier
 * operand of vec_mladd so that a lane is either kept or negated before the
 * permuted copy is added. */
404  register const vector signed short vprod1 =
405  (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
406  register const vector signed short vprod2 =
407  (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
408  register const vector signed short vprod3 =
409  (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
/* Byte permutations that swap neighbouring groups of 1, 2 and 4 16-bit
 * elements respectively (2-, 4- and 8-byte groups). */
410  register const vector unsigned char perm1 =
411  (const vector unsigned char)
412  { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
413  0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
414  register const vector unsigned char perm2 =
415  (const vector unsigned char)
416  { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
417  0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
418  register const vector unsigned char perm3 =
419  (const vector unsigned char)
420  { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
421  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
422 
423 
/* ONEITERBUTTERFLY(i, res): load row i of src and dst, widen to 16 bits,
 * take the difference and run the three in-row butterfly stages
 * (permute, then multiply by the sign pattern and add). The transformed row
 * is stored in res. */
424 #define ONEITERBUTTERFLY(i, res) \
425  { \
426  register vector unsigned char srcO = unaligned_load(stride * i, src); \
427  register vector unsigned char dstO = unaligned_load(stride * i, dst);\
428  \
429  /* Promote the unsigned chars to signed shorts. */ \
430  /* We're in the 8x8 function, we only care for the first 8. */ \
431  register vector signed short srcV = \
432  (vector signed short) VEC_MERGEH((vector signed char) vzero, \
433  (vector signed char) srcO); \
434  register vector signed short dstV = \
435  (vector signed short) VEC_MERGEH((vector signed char) vzero, \
436  (vector signed char) dstO); \
437  \
438  /* subtractions inside the first butterfly */ \
439  register vector signed short but0 = vec_sub(srcV, dstV); \
440  register vector signed short op1 = vec_perm(but0, but0, perm1); \
441  register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
442  register vector signed short op2 = vec_perm(but1, but1, perm2); \
443  register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
444  register vector signed short op3 = vec_perm(but2, but2, perm3); \
445  res = vec_mladd(but2, vprod3, op3); \
446  }
447 
448  ONEITERBUTTERFLY(0, temp0);
449  ONEITERBUTTERFLY(1, temp1);
450  ONEITERBUTTERFLY(2, temp2);
451  ONEITERBUTTERFLY(3, temp3);
452  ONEITERBUTTERFLY(4, temp4);
453  ONEITERBUTTERFLY(5, temp5);
454  ONEITERBUTTERFLY(6, temp6);
455  ONEITERBUTTERFLY(7, temp7);
456  }
457 #undef ONEITERBUTTERFLY
/* Pass 2: butterflies across the rows, then the absolute-value sum. */
458  {
459  register vector signed int vsum;
/* Stage A: combine rows that are 1 apart. */
460  register vector signed short line0 = vec_add(temp0, temp1);
461  register vector signed short line1 = vec_sub(temp0, temp1);
462  register vector signed short line2 = vec_add(temp2, temp3);
463  register vector signed short line3 = vec_sub(temp2, temp3);
464  register vector signed short line4 = vec_add(temp4, temp5);
465  register vector signed short line5 = vec_sub(temp4, temp5);
466  register vector signed short line6 = vec_add(temp6, temp7);
467  register vector signed short line7 = vec_sub(temp6, temp7);
468 
/* Stage B: combine stage-A lines that are 2 apart. */
469  register vector signed short line0B = vec_add(line0, line2);
470  register vector signed short line2B = vec_sub(line0, line2);
471  register vector signed short line1B = vec_add(line1, line3);
472  register vector signed short line3B = vec_sub(line1, line3);
473  register vector signed short line4B = vec_add(line4, line6);
474  register vector signed short line6B = vec_sub(line4, line6);
475  register vector signed short line5B = vec_add(line5, line7);
476  register vector signed short line7B = vec_sub(line5, line7);
477 
/* Stage C: combine stage-B lines that are 4 apart. */
478  register vector signed short line0C = vec_add(line0B, line4B);
479  register vector signed short line4C = vec_sub(line0B, line4B);
480  register vector signed short line1C = vec_add(line1B, line5B);
481  register vector signed short line5C = vec_sub(line1B, line5B);
482  register vector signed short line2C = vec_add(line2B, line6B);
483  register vector signed short line6C = vec_sub(line2B, line6B);
484  register vector signed short line3C = vec_add(line3B, line7B);
485  register vector signed short line7C = vec_sub(line3B, line7B);
486 
/* Accumulate |coefficient| of every stage-C line into four 32-bit partial
 * sums, reduce them to one value, and store that value into sum. */
487  vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
488  vsum = vec_sum4s(vec_abs(line1C), vsum);
489  vsum = vec_sum4s(vec_abs(line2C), vsum);
490  vsum = vec_sum4s(vec_abs(line3C), vsum);
491  vsum = vec_sum4s(vec_abs(line4C), vsum);
492  vsum = vec_sum4s(vec_abs(line5C), vsum);
493  vsum = vec_sum4s(vec_abs(line6C), vsum);
494  vsum = vec_sum4s(vec_abs(line7C), vsum);
495  vsum = vec_sums(vsum, (vector signed int) vzero);
496  vsum = vec_splat(vsum, 3);
497 
498  vec_ste(vsum, 0, &sum);
499  }
500  return sum;
501 }
502 
503 /*
504  * 16x8 works with 16 elements; it can avoid replicating loads, and
505  * gives the compiler more room for scheduling. It's only used from
506  * inside hadamard8_diff16_altivec.
507  *
508  * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
509  * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
510  * registers by itself. The following code includes hand-made register
511  * allocation. It's not clean, but on a 7450 the resulting code is much faster
512  * (best case falls from 700+ cycles to 550).
513  *
514  * xlc doesn't add spill code, but it doesn't know how to schedule for the
515  * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
516  * 25% fewer instructions...)
517  *
518  * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
519  * but xlc goes to around 660 on the regular C code...
520  */
/**
 * Score the difference between two blocks of 16 columns by 8 rows after a
 * butterfly (Hadamard-style, going by the function name) transform,
 * AltiVec version with hand-assigned vector registers.
 *
 * Each of the 8 rows (byte offsets stride * 0 .. stride * 7) is loaded once
 * from src and dst and split into two 8-element halves (VEC_MERGEH and
 * VEC_MERGEL). Each half goes through the same in-row and across-row
 * butterflies as in hadamard8_diff8x8_altivec; the "S"-suffixed variables
 * hold the VEC_MERGEL half. The absolute values of the results of both
 * halves are added into a single sum.
 *
 * @param s      unused in this function
 * @param dst    first pixel block
 * @param src    second pixel block
 * @param stride byte distance between consecutive rows of both blocks
 * @param h      unused in this function; 8 rows are always processed
 * @return the sum of the absolute values of the transformed differences
 */
521 static int hadamard8_diff16x8_altivec(MpegEncContext *s, const uint8_t *dst,
522  const uint8_t *src, ptrdiff_t stride, int h)
523 {
524  int __attribute__((aligned(16))) sum;
/* Per-row results of the VEC_MERGEH half, pinned to v0..v7. */
525  register vector signed short
526  temp0 __asm__ ("v0"),
527  temp1 __asm__ ("v1"),
528  temp2 __asm__ ("v2"),
529  temp3 __asm__ ("v3"),
530  temp4 __asm__ ("v4"),
531  temp5 __asm__ ("v5"),
532  temp6 __asm__ ("v6"),
533  temp7 __asm__ ("v7");
/* Per-row results of the VEC_MERGEL half, pinned to v8..v15. */
534  register vector signed short
535  temp0S __asm__ ("v8"),
536  temp1S __asm__ ("v9"),
537  temp2S __asm__ ("v10"),
538  temp3S __asm__ ("v11"),
539  temp4S __asm__ ("v12"),
540  temp5S __asm__ ("v13"),
541  temp6S __asm__ ("v14"),
542  temp7S __asm__ ("v15");
543  register const vector unsigned char vzero __asm__ ("v31") =
544  (const vector unsigned char) vec_splat_u8(0);
/* Pass 1: per-row butterflies for both halves. */
545  {
/* Sign patterns (v16..v18) and byte permutations (v19..v21) for the three
 * in-row butterfly stages. */
546  register const vector signed short vprod1 __asm__ ("v16") =
547  (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
548 
549  register const vector signed short vprod2 __asm__ ("v17") =
550  (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
551 
552  register const vector signed short vprod3 __asm__ ("v18") =
553  (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
554 
555  register const vector unsigned char perm1 __asm__ ("v19") =
556  (const vector unsigned char)
557  { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
558  0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
559 
560  register const vector unsigned char perm2 __asm__ ("v20") =
561  (const vector unsigned char)
562  { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
563  0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
564 
565  register const vector unsigned char perm3 __asm__ ("v21") =
566  (const vector unsigned char)
567  { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
568  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
569 
/* ONEITERBUTTERFLY(i, res1, res2): load row i of src and dst, widen both
 * halves to 16 bits, take the differences and run the three in-row butterfly
 * stages on each half. res1 receives the VEC_MERGEH half and res2 the
 * VEC_MERGEL half.
 * The temporaries are pinned to v22..v30 and the same register names are
 * deliberately reused by later temporaries of one expansion (for example
 * srcO and but1 both name v22), so the declaration order must not change. */
570 #define ONEITERBUTTERFLY(i, res1, res2) \
571  { \
572  register vector unsigned char srcO __asm__ ("v22") = \
573  unaligned_load(stride * i, src); \
574  register vector unsigned char dstO __asm__ ("v23") = \
575  unaligned_load(stride * i, dst);\
576  \
577  /* Promote the unsigned chars to signed shorts. */ \
578  register vector signed short srcV __asm__ ("v24") = \
579  (vector signed short) VEC_MERGEH((vector signed char) vzero, \
580  (vector signed char) srcO); \
581  register vector signed short dstV __asm__ ("v25") = \
582  (vector signed short) VEC_MERGEH((vector signed char) vzero, \
583  (vector signed char) dstO); \
584  register vector signed short srcW __asm__ ("v26") = \
585  (vector signed short) VEC_MERGEL((vector signed char) vzero, \
586  (vector signed char) srcO); \
587  register vector signed short dstW __asm__ ("v27") = \
588  (vector signed short) VEC_MERGEL((vector signed char) vzero, \
589  (vector signed char) dstO); \
590  \
591  /* subtractions inside the first butterfly */ \
592  register vector signed short but0 __asm__ ("v28") = \
593  vec_sub(srcV, dstV); \
594  register vector signed short but0S __asm__ ("v29") = \
595  vec_sub(srcW, dstW); \
596  register vector signed short op1 __asm__ ("v30") = \
597  vec_perm(but0, but0, perm1); \
598  register vector signed short but1 __asm__ ("v22") = \
599  vec_mladd(but0, vprod1, op1); \
600  register vector signed short op1S __asm__ ("v23") = \
601  vec_perm(but0S, but0S, perm1); \
602  register vector signed short but1S __asm__ ("v24") = \
603  vec_mladd(but0S, vprod1, op1S); \
604  register vector signed short op2 __asm__ ("v25") = \
605  vec_perm(but1, but1, perm2); \
606  register vector signed short but2 __asm__ ("v26") = \
607  vec_mladd(but1, vprod2, op2); \
608  register vector signed short op2S __asm__ ("v27") = \
609  vec_perm(but1S, but1S, perm2); \
610  register vector signed short but2S __asm__ ("v28") = \
611  vec_mladd(but1S, vprod2, op2S); \
612  register vector signed short op3 __asm__ ("v29") = \
613  vec_perm(but2, but2, perm3); \
614  register vector signed short op3S __asm__ ("v30") = \
615  vec_perm(but2S, but2S, perm3); \
616  res1 = vec_mladd(but2, vprod3, op3); \
617  res2 = vec_mladd(but2S, vprod3, op3S); \
618  }
619 
620  ONEITERBUTTERFLY(0, temp0, temp0S);
621  ONEITERBUTTERFLY(1, temp1, temp1S);
622  ONEITERBUTTERFLY(2, temp2, temp2S);
623  ONEITERBUTTERFLY(3, temp3, temp3S);
624  ONEITERBUTTERFLY(4, temp4, temp4S);
625  ONEITERBUTTERFLY(5, temp5, temp5S);
626  ONEITERBUTTERFLY(6, temp6, temp6S);
627  ONEITERBUTTERFLY(7, temp7, temp7S);
628  }
629 #undef ONEITERBUTTERFLY
/* Pass 2: butterflies across the rows for both halves, then the
 * absolute-value sum. */
630  {
631  register vector signed int vsum;
632 
/* VEC_MERGEH half: three add/sub stages combining rows 1, 2 and 4 apart. */
633  register vector signed short line0 = vec_add(temp0, temp1);
634  register vector signed short line1 = vec_sub(temp0, temp1);
635  register vector signed short line2 = vec_add(temp2, temp3);
636  register vector signed short line3 = vec_sub(temp2, temp3);
637  register vector signed short line4 = vec_add(temp4, temp5);
638  register vector signed short line5 = vec_sub(temp4, temp5);
639  register vector signed short line6 = vec_add(temp6, temp7);
640  register vector signed short line7 = vec_sub(temp6, temp7);
641 
642  register vector signed short line0B = vec_add(line0, line2);
643  register vector signed short line2B = vec_sub(line0, line2);
644  register vector signed short line1B = vec_add(line1, line3);
645  register vector signed short line3B = vec_sub(line1, line3);
646  register vector signed short line4B = vec_add(line4, line6);
647  register vector signed short line6B = vec_sub(line4, line6);
648  register vector signed short line5B = vec_add(line5, line7);
649  register vector signed short line7B = vec_sub(line5, line7);
650 
651  register vector signed short line0C = vec_add(line0B, line4B);
652  register vector signed short line4C = vec_sub(line0B, line4B);
653  register vector signed short line1C = vec_add(line1B, line5B);
654  register vector signed short line5C = vec_sub(line1B, line5B);
655  register vector signed short line2C = vec_add(line2B, line6B);
656  register vector signed short line6C = vec_sub(line2B, line6B);
657  register vector signed short line3C = vec_add(line3B, line7B);
658  register vector signed short line7C = vec_sub(line3B, line7B);
659 
/* VEC_MERGEL half: the same three stages on the "S" rows. */
660  register vector signed short line0S = vec_add(temp0S, temp1S);
661  register vector signed short line1S = vec_sub(temp0S, temp1S);
662  register vector signed short line2S = vec_add(temp2S, temp3S);
663  register vector signed short line3S = vec_sub(temp2S, temp3S);
664  register vector signed short line4S = vec_add(temp4S, temp5S);
665  register vector signed short line5S = vec_sub(temp4S, temp5S);
666  register vector signed short line6S = vec_add(temp6S, temp7S);
667  register vector signed short line7S = vec_sub(temp6S, temp7S);
668 
669  register vector signed short line0BS = vec_add(line0S, line2S);
670  register vector signed short line2BS = vec_sub(line0S, line2S);
671  register vector signed short line1BS = vec_add(line1S, line3S);
672  register vector signed short line3BS = vec_sub(line1S, line3S);
673  register vector signed short line4BS = vec_add(line4S, line6S);
674  register vector signed short line6BS = vec_sub(line4S, line6S);
675  register vector signed short line5BS = vec_add(line5S, line7S);
676  register vector signed short line7BS = vec_sub(line5S, line7S);
677 
678  register vector signed short line0CS = vec_add(line0BS, line4BS);
679  register vector signed short line4CS = vec_sub(line0BS, line4BS);
680  register vector signed short line1CS = vec_add(line1BS, line5BS);
681  register vector signed short line5CS = vec_sub(line1BS, line5BS);
682  register vector signed short line2CS = vec_add(line2BS, line6BS);
683  register vector signed short line6CS = vec_sub(line2BS, line6BS);
684  register vector signed short line3CS = vec_add(line3BS, line7BS);
685  register vector signed short line7CS = vec_sub(line3BS, line7BS);
686 
/* Accumulate |coefficient| of every final line of both halves into four
 * 32-bit partial sums, reduce them to one value, and store it into sum. */
687  vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
688  vsum = vec_sum4s(vec_abs(line1C), vsum);
689  vsum = vec_sum4s(vec_abs(line2C), vsum);
690  vsum = vec_sum4s(vec_abs(line3C), vsum);
691  vsum = vec_sum4s(vec_abs(line4C), vsum);
692  vsum = vec_sum4s(vec_abs(line5C), vsum);
693  vsum = vec_sum4s(vec_abs(line6C), vsum);
694  vsum = vec_sum4s(vec_abs(line7C), vsum);
695 
696  vsum = vec_sum4s(vec_abs(line0CS), vsum);
697  vsum = vec_sum4s(vec_abs(line1CS), vsum);
698  vsum = vec_sum4s(vec_abs(line2CS), vsum);
699  vsum = vec_sum4s(vec_abs(line3CS), vsum);
700  vsum = vec_sum4s(vec_abs(line4CS), vsum);
701  vsum = vec_sum4s(vec_abs(line5CS), vsum);
702  vsum = vec_sum4s(vec_abs(line6CS), vsum);
703  vsum = vec_sum4s(vec_abs(line7CS), vsum);
704  vsum = vec_sums(vsum, (vector signed int) vzero);
705  vsum = vec_splat(vsum, 3);
706 
707  vec_ste(vsum, 0, &sum);
708  }
709  return sum;
710 }
711 
712 static int hadamard8_diff16_altivec(MpegEncContext *s, const uint8_t *dst,
713  const uint8_t *src, ptrdiff_t stride, int h)
714 {
715  int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
716 
717  if (h == 16) {
718  dst += 8 * stride;
719  src += 8 * stride;
720  score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
721  }
722  return score;
723 }
724 #endif /* HAVE_ALTIVEC */
725 
727 {
728 #if HAVE_ALTIVEC
730  return;
731 
732  c->pix_abs[0][1] = sad16_x2_altivec;
733  c->pix_abs[0][2] = sad16_y2_altivec;
734  c->pix_abs[0][3] = sad16_xy2_altivec;
735  c->pix_abs[0][0] = sad16_altivec;
736  c->pix_abs[1][0] = sad8_altivec;
737 
738  c->sad[0] = sad16_altivec;
739  c->sad[1] = sad8_altivec;
740  c->sse[0] = sse16_altivec;
741  c->sse[1] = sse8_altivec;
742 
743  c->hadamard8_diff[0] = hadamard8_diff16_altivec;
744  c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
745 #endif /* HAVE_ALTIVEC */
746 }
mpegvideo.h
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
aligned
static int aligned(int val)
Definition: dashdec.c:171
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:198
MECmpContext
Definition: me_cmp.h:55
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
PPC_ALTIVEC
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
cpu.h
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:424
attributes.h
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
avcodec.h
stride
#define stride
Definition: h264pred_template.c:536
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
me_cmp.h
AVCodecContext
main external API structure.
Definition: avcodec.h:451
util_altivec.h
ff_me_cmp_init_ppc
av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
Definition: me_cmp.c:726
cpu.h
h
h
Definition: vp9dsp_template.c:2070
MpegEncContext
MpegEncContext.
Definition: mpegvideo.h:73
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
src
#define src
Definition: vp8dsp.c:248