FFmpeg
swscale_ppc_template.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
/**
 * Vertically filter 16 consecutive output pixels using vector code.
 *
 * Each of the 16 accumulators starts from the dither byte selected for its
 * column, shifted left by 12. Every tap j is then run through the
 * yuv2planeX_8 macro twice (columns x and x + 8) with coefficient j
 * broadcast to all lanes. Finally the sums are shifted right by 19, packed
 * down to bytes with unsigned saturation and stored at dest.
 *
 * NOTE(review): this looks like the vector counterpart of yuv2planeX_u; the
 * multiply-accumulate itself lives in the yuv2planeX_8 macro, which is
 * defined outside this template -- confirm there.
 *
 * @param filter     vertical filter coefficients, loaded once per tap
 * @param filterSize number of taps (and of row pointers in src)
 * @param src        array of source row pointers, one per tap
 * @param dest       where the 16 output bytes are stored; the caller in this
 *                   file only passes 16-byte-aligned addresses
 * @param dither     dither table, indexed with (column + offset) & 7
 * @param offset     added to the column index before the dither lookup
 * @param x          index of the first of the 16 columns within the row
 */
24 static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
25  const int16_t **src, uint8_t *dest,
26  const uint8_t *dither, int offset, int x)
27 {
28  register int i, j;
29  LOCAL_ALIGNED(16, int, val, [16]);
30  vector signed int vo1, vo2, vo3, vo4;
31  vector unsigned short vs1, vs2;
32  vector unsigned char vf;
/* Shift count 19 in every lane, built as 10 + 9 (presumably because the
 * splat immediate cannot encode 19 directly -- confirm). */
33  vector unsigned int altivec_vectorShiftInt19 =
34  vec_add(vec_splat_u32(10), vec_splat_u32(9));
35 
/* Seed the scalar staging buffer with the per-column dither values. */
36  for (i = 0; i < 16; i++)
37  val[i] = dither[(x + i + offset) & 7] << 12;
38 
/* Load the 16 ints as four vectors of four accumulators each
 * (byte offsets 0, 16, 32, 48 into val). */
39  vo1 = vec_ld(0, val);
40  vo2 = vec_ld(16, val);
41  vo3 = vec_ld(32, val);
42  vo4 = vec_ld(48, val);
43 
44  for (j = 0; j < filterSize; j++) {
/* joffset and xoffset are j and x scaled by 2. They are not referenced
 * directly below; presumably LOAD_FILTER and LOAD_L1 pick them up by
 * name as byte offsets, so they must not be renamed -- confirm against
 * the macro definitions. */
45  unsigned int joffset=j<<1;
46  unsigned int xoffset=x<<1;
47  vector unsigned char av_unused perm;
48  vector signed short l1,vLumFilter;
49  LOAD_FILTER(vLumFilter,filter);
/* Broadcast element 0 of the loaded filter vector to every lane. */
50  vLumFilter = vec_splat(vLumFilter, 0);
51  LOAD_L1(l1,src[j],perm);
/* Accumulate this tap: vo1/vo2 for columns starting at x,
 * vo3/vo4 for columns starting at x + 8. */
52  yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
53  yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
54  }
55 
/* Arithmetic shift right by 19, matching the ">> 19" of the scalar path. */
56  vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
57  vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
58  vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
59  vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
/* Narrow 32 -> 16 -> 8 bits with unsigned saturation, then store all
 * 16 bytes at dest. */
60  vs1 = vec_packsu(vo1, vo2);
61  vs2 = vec_packsu(vo3, vo4);
62  vf = vec_packsu(vs1, vs2);
63  VEC_ST(vf, 0, dest);
64 }
65 
66 
/**
 * Scalar vertical filter for the output columns [x, dstW).
 *
 * For every column the accumulator starts at the dither byte for that
 * column shifted left by 12, each source row contributes
 * src[tap][col] * filter[tap], and the sum shifted right by 19 is clipped
 * to 0..255 and written to dest[col].
 *
 * @param filter     vertical filter coefficients, one per source row
 * @param filterSize number of coefficients / source rows
 * @param src        array of filterSize source row pointers
 * @param dest       output row; written at indices x .. dstW - 1
 * @param dstW       exclusive upper bound of the columns to process
 * @param dither     dither table, indexed with (col + offset) & 7
 * @param offset     added to the column index before the dither lookup
 * @param x          first column to process
 */
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int col = x;

    while (col < dstW) {
        /* Seed the accumulator with the dither value for this column. */
        int acc = dither[(col + offset) & 7] << 12;
        int tap;

        for (tap = 0; tap < filterSize; tap++) {
            const int16_t *row = src[tap];

            acc += row[col] * filter[tap];
        }

        dest[col] = av_clip_uint8(acc >> 19);
        col++;
    }
}
80 
81 static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
82  const int16_t **src, uint8_t *dest, int dstW,
83  const uint8_t *dither, int offset)
84 {
85  int dst_u = -(uintptr_t)dest & 15;
86  int i;
87 
88  yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
89 
90  for (i = dst_u; i < dstW - 15; i += 16)
91  FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
92  offset, i);
93 
94  yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
95 }
96 
/**
 * Horizontally scale a row of 8-bit samples into 16-bit intermediates.
 *
 * The scalar path below defines the result: for every output i,
 *   dst[i] = min(sum_j src[filterPos[i] + j] * filter[filterSize * i + j] >> 7,
 *                (1 << 15) - 1)
 * The vector paths (filterSize a multiple of 4) are presumably meant to
 * produce the same values; they rely on helper macros and on vzero, all
 * defined outside this template.
 *
 * @param c          scaler context; not referenced directly in this body
 * @param dst        output row, dstW entries
 * @param dstW       number of output samples
 * @param src        input row of 8-bit samples
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  index of the first input sample for each output sample
 * @param filterSize number of taps per output sample
 */
97 static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
98  const uint8_t *src, const int16_t *filter,
99  const int32_t *filterPos, int filterSize)
100 {
101  register int i;
/* Aligned scratch used to move the vector sum back to a scalar. */
102  LOCAL_ALIGNED(16, int, tempo, [4]);
103 
/* Tap counts that are not a multiple of 4 use plain scalar code. */
104  if (filterSize % 4) {
105  for (i = 0; i < dstW; i++) {
106  register int j;
107  register int srcPos = filterPos[i];
108  register int val = 0;
109  for (j = 0; j < filterSize; j++)
110  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
/* Only the upper bound is clamped here. */
111  dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
112  }
113  } else
114  switch (filterSize) {
/* 4 taps per output sample. */
115  case 4:
116  for (i = 0; i < dstW; i++) {
117  register int srcPos = filterPos[i];
118 
119  vector unsigned char src_vF = unaligned_load(srcPos, src);
120  vector signed short src_v, filter_v;
121  vector signed int val_vEven, val_s;
/* Interleave with vzero to widen the bytes to 16 bits; per the
 * original comment, vec_unpackh is avoided because it sign-extends. */
122  src_v = // vec_unpackh sign-extends...
123  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
124  // now put our elements in the even slots
125  src_v = vec_mergeh(src_v, (vector signed short)vzero);
126  GET_VF4(i, filter_v, filter);
127  val_vEven = vec_mule(src_v, filter_v);
128  val_s = vec_sums(val_vEven, vzero);
/* NOTE(review): this case and case 8 use vec_st while the other
 * paths use VEC_ST -- confirm whether that is intentional. */
129  vec_st(val_s, 0, tempo);
/* The scalar total is read back from the last element. */
130  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
131  }
132  break;
/* 8 taps per output sample. */
133  case 8:
134  for (i = 0; i < dstW; i++) {
135  register int srcPos = filterPos[i];
136  vector unsigned char src_vF, av_unused src_v0, av_unused src_v1;
137  vector unsigned char av_unused permS;
138  vector signed short src_v, filter_v;
139  vector signed int val_v, val_s;
140  FIRST_LOAD(src_v0, srcPos, src, permS);
141  LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
142  src_v = // vec_unpackh sign-extends...
143  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
/* i << 4 is i * 16, i.e. 8 int16_t coefficients per output sample
 * (assuming the offset argument of vec_ld is in bytes). */
144  filter_v = vec_ld(i << 4, filter);
145  val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
146  val_s = vec_sums(val_v, vzero);
147  vec_st(val_s, 0, tempo);
148  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
149  }
150  break;
151 
/* 16 taps per output sample: high and low halves of one source vector. */
152  case 16:
153  for (i = 0; i < dstW; i++) {
154  register int srcPos = filterPos[i];
155 
156  vector unsigned char src_vF = unaligned_load(srcPos, src);
157  vector signed short src_vA = // vec_unpackh sign-extends...
158  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
159  vector signed short src_vB = // vec_unpackh sign-extends...
160  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
/* i << 5 is i * 32, i.e. 16 int16_t coefficients per output sample,
 * loaded as two vectors 16 apart. */
161  vector signed short filter_v0 = vec_ld(i << 5, filter);
162  vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
163 
/* Chain the two multiply-sums through val_acc. */
164  vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
165  vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
166 
167  vector signed int val_s = vec_sums(val_v, vzero);
168 
169  VEC_ST(val_s, 0, tempo);
170  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
171  }
172  break;
173 
/* Any other multiple of 4: blocks of 16 taps, then at most one block
 * of 8 taps.
 * NOTE(review): when filterSize % 8 == 4 (e.g. 12 or 20) the last 4
 * taps are not accumulated by either step below; presumably callers
 * only pass multiples of 8 on this path -- confirm. */
174  default:
175  for (i = 0; i < dstW; i++) {
/* offset is i * filterSize scaled by 2 (the size of one int16_t
 * coefficient); presumably the byte offset of row i's coefficients. */
176  register int j, av_unused offset = i * 2 * filterSize;
177  register int srcPos = filterPos[i];
178 
179  vector signed int val_s, val_v = (vector signed int)vzero;
180  vector signed short av_unused filter_v0R;
181  vector unsigned char av_unused permF, av_unused src_v0, av_unused permS;
182  FIRST_LOAD(filter_v0R, offset, filter, permF);
183  FIRST_LOAD(src_v0, srcPos, src, permS);
184 
/* Full 16-tap blocks; val_v carries the running sums. */
185  for (j = 0; j < filterSize - 15; j += 16) {
186  vector unsigned char av_unused src_v1, src_vF;
187  vector signed short av_unused filter_v1R, av_unused filter_v2R,
188  filter_v0, filter_v1, src_vA, src_vB;
189  vector signed int val_acc;
190  LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
191  src_vA = // vec_unpackh sign-extends...
192  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
193  src_vB = // vec_unpackh sign-extends...
194  (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
195  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
196  GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
197 
198  val_acc = vec_msums(src_vA, filter_v0, val_v);
199  val_v = vec_msums(src_vB, filter_v1, val_acc);
/* Presumably hands the last-loaded vectors to the next iteration;
 * see the UPDATE_PTR definition. */
200  UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
201  }
202 
/* One more block of 8 taps if at least 8 remain. */
203  if (j < filterSize - 7) {
204  // loading src_v0 is useless, it's already done above
205  vector unsigned char av_unused src_v1, src_vF;
206  vector signed short src_v, av_unused filter_v1R, filter_v;
207  LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
208  src_v = // vec_unpackh sign-extends...
209  (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
210  GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
211  val_v = vec_msums(src_v, filter_v, val_v);
212  }
/* Reduce the running sums and read the total back through tempo. */
213  val_s = vec_sums(val_v, vzero);
214 
215  VEC_ST(val_s, 0, tempo);
216  dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
217  }
218  }
219 }
av_unused
#define av_unused
Definition: attributes.h:125
yuv2planeX
static void FUNC() yuv2planeX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: swscale_ppc_template.c:81
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
perm
perm
Definition: f_perms.c:74
src
#define src
Definition: vp8dsp.c:254
int32_t
int32_t
Definition: audio_convert.c:194
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
yuv2planeX_u
static void yuv2planeX_u(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:67
val
const char const char void * val
Definition: avisynth_c.h:863
FFMIN
#define FFMIN(a, b)
Definition: common.h:96
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
uint8_t
uint8_t
Definition: audio_convert.c:194
FUNC
#define FUNC(a)
Definition: bit_depth_template.c:104
hScale_real
static void FUNC() hScale_real(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_ppc_template.c:97
LOCAL_ALIGNED
#define LOCAL_ALIGNED(a, t, v,...)
Definition: internal.h:114
int
int
Definition: ffmpeg_filter.c:191
SwsContext
Definition: swscale_internal.h:280
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
yuv2planeX_8_16
static void FUNC() yuv2planeX_8_16(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x)
Definition: swscale_ppc_template.c:24
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:57