hpeldsp_altivec.c
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h" /* vcprm(), used by the 8-pixel routines below */
#include "libavcodec/hpeldsp.h"
#include "dsputil_altivec.h"

#if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops with this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}

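/*
 * For reference, a minimal scalar sketch of what the routine above
 * computes: a straight 16-byte-per-row copy, with vec_lvsl()/vec_perm()
 * realigning the possibly unaligned source. Illustration only, not part
 * of the original file:
 *
 *     for (i = 0; i < h; i++)
 *         memcpy(block + i * line_size, pixels + i * line_size, 16);
 */
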
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        blockv   = vec_ld( 0, block);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, perm);
        blockv   = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}

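/*
 * vec_avg() computes the rounded average (a + b + 1) >> 1 in each byte
 * lane, which is the same per-byte result the scalar op_avg macro above
 * produces across a whole 32-bit word. A scalar sketch of
 * ff_avg_pixels16_altivec (illustration only):
 *
 *     for (i = 0; i < h; i++)
 *         for (j = 0; j < 16; j++)
 *             block[i * line_size + j] =
 *                 (block[i * line_size + j] + pixels[i * line_size + j] + 1) >> 1;
 */
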
/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8-byte aligned, so we are either in the left half of a
           16-byte line (16-byte aligned) or in the right half (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv   = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0, 1, s0, s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}

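/*
 * AltiVec has no 8-byte store, so the 8-pixel routines in this file do a
 * read-modify-write: load the full 16-byte line containing the
 * destination, splice the new 8 bytes into the correct half with
 * vec_perm() (vcprm() builds the permute mask), and store all 16 bytes
 * back. The "rightside" test above selects which half is overwritten.
 */
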
/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

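/*
 * The xy2 routines implement half-pel interpolation in both directions:
 * each output pixel is the rounded average of a 2x2 source neighborhood.
 * A scalar sketch of one output pixel (illustration only):
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + line_size] + src[x + line_size + 1] + 2) >> 2;
 *
 * The horizontal pair sums are kept in 16-bit lanes (pixelssum1 and
 * pixelssum2 above) so that each row's sum is computed once and reused
 * for the following output line.
 */
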
/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

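/*
 * Identical to put_pixels8_xy2_altivec except for the rounding bias:
 * vcone replaces vctwo, giving (a + b + c + d + 1) >> 2 per pixel, the
 * "no rounding" variant some codecs require.
 */
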
/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
                                   pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

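/*
 * The 16-pixel-wide xy2 version applies the same scheme twice per line:
 * vec_mergeh()/vec_mergel() zero-extend the low and high 8 pixels into
 * two vectors of 16-bit lanes, the sums are averaged separately, and
 * vec_packsu() packs both halves back into one 16-byte store.
 */
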
/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
                                   pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
#endif /* HAVE_ALTIVEC */

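/* Registers the AltiVec half-pel functions in the HpelDSPContext function
 * tables when runtime CPU detection reports AltiVec support. */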
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_ALTIVEC) {
        c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;

        c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;

        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    }
#endif /* HAVE_ALTIVEC */
}
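
/*
 * Minimal usage sketch (illustration only; assumes the generic hpeldsp
 * entry point ff_hpeldsp_init(), which dispatches to per-arch init
 * functions such as ff_hpeldsp_init_ppc()):
 *
 *     HpelDSPContext c;
 *     ff_hpeldsp_init(&c, avctx->flags);
 *     // 16-wide block ([0]), half-pel in both x and y ([3]):
 *     c.put_pixels_tab[0][3](dst, src, stride, 16);
 */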