FFmpeg
swscale_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "yuv2rgb_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 #if HAVE_BIGENDIAN
/* All-zero vector; GET_VF4 casts it to vector signed short before merging. */
36 #define vzero vec_splat_s32(0)
37 
/*
 * GET_LS: load the vector at byte offset ((b) << 1) + 16 from s, combine it
 * with the previously loaded vector `a` through permute vector `c`, and
 * store the result in `ls`. `ls` is not a parameter: it must exist in the
 * scope where the macro is expanded. `a` is then replaced by the newly
 * loaded vector.
 */
38 #define GET_LS(a,b,c,s) {\
39  vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
40  ls = vec_perm(a, l2, c);\
41  a = l2;\
42  }
43 
/*
 * yuv2planeX_8: fetch one vector of 16-bit source samples via GET_LS,
 * multiply it by `filter` with vec_mule/vec_mulo, interleave the two
 * product vectors with vec_mergeh/vec_mergel and add them to the 32-bit
 * accumulators d1 and d2. l1 is updated by GET_LS.
 */
44 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
45  vector signed short ls;\
46  vector signed int vf1, vf2, i1, i2;\
47  GET_LS(l1, x, perm, src);\
48  i1 = vec_mule(filter, ls);\
49  i2 = vec_mulo(filter, ls);\
50  vf1 = vec_mergeh(i1, i2);\
51  vf2 = vec_mergel(i1, i2);\
52  d1 = vec_add(d1, vf1);\
53  d2 = vec_add(d2, vf2);\
54  } while (0)
55 
/*
 * LOAD_FILTER: load a vector from f at byte offset `joffset` and permute it
 * with itself using the vec_lvsl() result for the same address.
 * `joffset` is not a parameter; it is taken from the expansion scope.
 */
56 #define LOAD_FILTER(vf,f) {\
57  vector unsigned char perm0 = vec_lvsl(joffset, f);\
58  vf = vec_ld(joffset, f);\
59  vf = vec_perm(vf, vf, perm0);\
60 }
/*
 * LOAD_L1: store the vec_lvsl() permute vector for s + xoffset in p and the
 * vector loaded from that address in ll1.
 * `xoffset` is not a parameter; it is taken from the expansion scope.
 */
61 #define LOAD_L1(ll1,s,p){\
62  p = vec_lvsl(xoffset, s);\
63  ll1 = vec_ld(xoffset, s);\
64 }
65 
66 // The 3 in the (a << 3) of GET_VF4 below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
67 
68 // The neat trick: We only care for half the elements,
69 // high or low depending on (a << 3) % 16 (it's 0 or 8 here),
70 // and we're going to use vec_mule, so we choose
71 // carefully how to "unpack" the elements into the even slots.
72 #define GET_VF4(a, vf, f) {\
73  vf = vec_ld(a<< 3, f);\
74  if ((a << 3) % 16)\
75  vf = vec_mergel(vf, (vector signed short)vzero);\
76  else\
77  vf = vec_mergeh(vf, (vector signed short)vzero);\
78 }
/* FIRST_LOAD: load the vector at s + pos into sv and its vec_lvsl() permute vector into per. */
79 #define FIRST_LOAD(sv, pos, s, per) {\
80  sv = vec_ld(pos, s);\
81  per = vec_lvsl(pos, s);\
82 }
/* UPDATE_PTR: copy s0 into d0 and s1 into d1. */
83 #define UPDATE_PTR(s0, d0, s1, d1) {\
84  d0 = s0;\
85  d1 = s1;\
86 }
/*
 * LOAD_SRCV: load the vector at byte offset pos + a + 16 from s into v1 and
 * combine it with the already loaded v0 through permute vector `per`,
 * storing the result in vf.
 */
87 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
88  v1 = vec_ld(pos + a + 16, s);\
89  vf = vec_perm(v0, v1, per);\
90 }
/*
 * LOAD_SRCV8: variant of LOAD_SRCV that only performs the second load when
 * the address s + pos lies more than 8 bytes into its 16-byte line
 * (presumably because the bytes of interest then spill into the following
 * vector -- confirm against the template that uses it).
 *
 *   pos, a  byte offsets; the second load reads from s + pos + a + 16
 *   s       source pointer
 *   per     permute vector used to combine v0 and v1
 *   v0      previously loaded vector (read only)
 *   v1      second vector; overwritten only when the extra load is needed
 *   vf      receives vec_perm(v0, v1, per)
 *
 * The permute uses the v1 argument itself rather than a hard-coded
 * identifier, so the macro does not depend on how the caller names its
 * variables. Arguments are parenthesized so expressions can be passed.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)(s) + (pos)) % 16) > 8) {\
        v1 = vec_ld((pos) + (a) + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}
/*
 * GET_VFD: load the vector at byte offset
 * (a * 2 * filterSize) + (b * 2) + 16 + off from f into vf1 and combine it
 * with the already loaded vf0 through permute vector `per`, storing the
 * result in vf.
 * `filterSize` is not a parameter; it is taken from the expansion scope.
 */
97 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
98  vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
99  vf = vec_perm(vf0, vf1, per);\
100 }
101 
/*
 * Include the shared template with FUNC() appending "_altivec" to names.
 * Presumably this is where hScale_real_altivec and yuv2planeX_altivec,
 * referenced later in this file, come from -- the template is not visible here.
 */
102 #define FUNC(name) name ## _altivec
103 #include "swscale_ppc_template.c"
104 #undef FUNC
105 
/* vzero is redefined as a local float vector by the functions further down. */
106 #undef vzero
107 
108 #endif /* HAVE_BIGENDIAN */
109 
/* Right-shift applied by the vector paths; the scalar helpers use a local `shift` with the same value. */
110 #define SHIFT 3
111 
/*
 * get_pixel: shift `val` right by `shift`, clip it with
 * av_clip_<signedness>16() and add `bias`.
 * `shift` is not a parameter: a variable of that exact name must be visible
 * where the macro is expanded.
 */
112 #define get_pixel(val, bias, signedness) \
113  (bias + av_clip_ ## signedness ## 16(val >> shift))
114 
115 static void
116 yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
117 {
118  static const int shift = 3;
119  static const float float_mult = 1.0f / 65535.0f;
120  int i, val;
121  uint16_t val_uint;
122 
123  for (i = start; i < dstW; ++i){
124  val = src[i] + (1 << (shift - 1));
125  val_uint = get_pixel(val, 0, uint);
126  dest[i] = float_mult * (float)val_uint;
127  }
128 }
129 
130 static void
131 yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)
132 {
133  static const int shift = 3;
134  static const float float_mult = 1.0f / 65535.0f;
135  int i, val;
136  uint16_t val_uint;
137 
138  for (i = start; i < dstW; ++i){
139  val = src[i] + (1 << (shift - 1));
140  val_uint = get_pixel(val, 0, uint);
141  dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
142  }
143 }
144 
145 static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
146 {
147  const int dst_u = -(uintptr_t)dest & 3;
148  const int add = (1 << (SHIFT - 1));
149  const int clip = (1 << 16) - 1;
150  const float fmult = 1.0f / 65535.0f;
151  const vec_u32 vadd = (vec_u32) {add, add, add, add};
152  const vec_u32 vshift = (vec_u32) vec_splat_u32(SHIFT);
153  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
154  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
155  const vec_f vzero = (vec_f) {0, 0, 0, 0};
156  vec_u32 v;
157  vec_f vd;
158  int i;
159 
160  yuv2plane1_float_u(src, dest, dst_u, 0);
161 
162  for (i = dst_u; i < dstW - 3; i += 4) {
163  v = vec_ld(0, (const uint32_t *) &src[i]);
164  v = vec_add(v, vadd);
165  v = vec_sr(v, vshift);
166  v = vec_min(v, vlargest);
167 
168  vd = vec_ctf(v, 0);
169  vd = vec_madd(vd, vmul, vzero);
170 
171  vec_st(vd, 0, &dest[i]);
172  }
173 
174  yuv2plane1_float_u(src, dest, dstW, i);
175 }
176 
177 static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)
178 {
179  const int dst_u = -(uintptr_t)dest & 3;
180  const int add = (1 << (SHIFT - 1));
181  const int clip = (1 << 16) - 1;
182  const float fmult = 1.0f / 65535.0f;
183  const vec_u32 vadd = (vec_u32) {add, add, add, add};
184  const vec_u32 vshift = (vec_u32) vec_splat_u32(SHIFT);
185  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
186  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
187  const vec_f vzero = (vec_f) {0, 0, 0, 0};
188  const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16};
189  const vec_u16 vswapsmall = vec_splat_u16(8);
190  vec_u32 v;
191  vec_f vd;
192  int i;
193 
194  yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
195 
196  for (i = dst_u; i < dstW - 3; i += 4) {
197  v = vec_ld(0, (const uint32_t *) &src[i]);
198  v = vec_add(v, vadd);
199  v = vec_sr(v, vshift);
200  v = vec_min(v, vlargest);
201 
202  vd = vec_ctf(v, 0);
203  vd = vec_madd(vd, vmul, vzero);
204 
205  vd = (vec_f) vec_rl((vec_u32) vd, vswapbig);
206  vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall);
207 
208  vec_st(vd, 0, (float *) &dest[i]);
209  }
210 
211  yuv2plane1_float_bswap_u(src, dest, dstW, i);
212 }
213 
/*
 * Generate yuv2plane1_float<BE_LE>_altivec(), a thin adapter that casts src
 * to const int32_t * and dest to sample_t * and forwards to impl().
 * The dither and offset arguments are accepted but not used.
 */
#define yuv2plane1_float(impl, sample_t, BE_LE) \
static void yuv2plane1_float ## BE_LE ## _altivec(const int16_t *src, \
                                                  uint8_t *dest, \
                                                  int dstW, \
                                                  const uint8_t *dither, \
                                                  int offset) \
{ \
    impl((const int32_t *)src, (sample_t *)dest, dstW); \
}

/* The native byte order uses the plain kernel, the other one the bswap kernel. */
#if !HAVE_BIGENDIAN
yuv2plane1_float(yuv2plane1_float_altivec, float, LE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
#else
yuv2plane1_float(yuv2plane1_float_altivec, float, BE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
#endif
229 
230 #endif /* HAVE_ALTIVEC */
231 
233 {
234 #if HAVE_ALTIVEC
235  enum AVPixelFormat dstFormat = c->dstFormat;
236 
238  return;
239 
240 #if HAVE_BIGENDIAN
241  if (c->srcBpc == 8 && c->dstBpc <= 14) {
242  c->hyScale = c->hcScale = hScale_real_altivec;
243  }
244  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
245  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
246  !c->needAlpha) {
247  c->yuv2planeX = yuv2planeX_altivec;
248  }
249 #endif
250 
251  if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
252  c->yuv2plane1 = yuv2plane1_floatBE_altivec;
253  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
254  c->yuv2plane1 = yuv2plane1_floatLE_altivec;
255  }
256 
257  /* The following list of supported dstFormat values should
258  * match what's found in the body of ff_yuv2packedX_altivec() */
259  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
260  switch (c->dstFormat) {
261  case AV_PIX_FMT_ABGR:
262  c->yuv2packedX = ff_yuv2abgr_X_altivec;
263  break;
264  case AV_PIX_FMT_BGRA:
265  c->yuv2packedX = ff_yuv2bgra_X_altivec;
266  break;
267  case AV_PIX_FMT_ARGB:
268  c->yuv2packedX = ff_yuv2argb_X_altivec;
269  break;
270  case AV_PIX_FMT_RGBA:
271  c->yuv2packedX = ff_yuv2rgba_X_altivec;
272  break;
273  case AV_PIX_FMT_BGR24:
274  c->yuv2packedX = ff_yuv2bgr24_X_altivec;
275  break;
276  case AV_PIX_FMT_RGB24:
277  c->yuv2packedX = ff_yuv2rgb24_X_altivec;
278  break;
279  }
280  }
281 #endif /* HAVE_ALTIVEC */
282 
284 }
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
SHIFT
#define SHIFT
Definition: median_template.c:40
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:107
av_float2int
static av_always_inline uint32_t av_float2int(float f)
Reinterpret a float as a 32-bit integer.
Definition: intfloat.h:50
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:721
SWS_BITEXACT
#define SWS_BITEXACT
Definition: swscale.h:115
val
static double val(void *priv, double ch)
Definition: aeval.c:77
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:735
av_cold
#define av_cold
Definition: attributes.h:90
clip
clip
Definition: af_crystalizer.c:122
float
float
Definition: af_crystalizer.c:122
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:767
yuv2plane1_float
yuv2plane1_float(yuv2plane1_float_c_template, yuv2plane1_float(float, LE)
Definition: output.c:311
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:61
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
vec_u32
#define vec_u32
Definition: util_altivec.h:38
shift
static int shift(int a, int b)
Definition: bonk.c:261
cpu.h
av_bswap32
#define av_bswap32
Definition: bswap.h:47
SWS_FULL_CHR_H_INT
#define SWS_FULL_CHR_H_INT
Perform full chroma upsampling when upscaling to RGB.
Definition: swscale.h:97
yuv2rgb_altivec.h
attributes.h
ff_sws_init_swscale_vsx
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
Definition: swscale_vsx.c:2019
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
swscale_internal.h
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
ff_sws_init_swscale_ppc
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
Definition: swscale_altivec.c:232
swscale_ppc_template.c
util_altivec.h
int32_t
int32_t
Definition: audioconvert.c:56
SwsContext
Definition: swscale_internal.h:324
vec_f
#define vec_f
Definition: util_altivec.h:40
src
#define src
Definition: vp8dsp.c:248
vec_u16
#define vec_u16
Definition: util_altivec.h:36
swscale.h