00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00029 #include "libavcodec/rv34dsp.h"
00030 #include "libavutil/mem.h"
00031 #include "libavutil/x86/cpu.h"
00032 #include "dsputil_mmx.h"
00033
00034 #if HAVE_YASM
/* Prototypes of the external-assembly (x86asm) RV40 chroma
 * motion-compensation primitives.  dst/src point to pixel data, stride is
 * the line size in bytes, h the number of rows, and x/y the fractional
 * chroma motion-vector components (exact semantics live in the .asm). */
void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

void ff_put_rv40_chroma_mc4_mmx  (uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* Declare the four weighted-prediction helpers (rounded / non-rounded,
 * 16- and 8-pixel-wide variants) for one instruction-set suffix.
 * Presumably dst is a w1/w2-weighted combination of src1 and src2 —
 * the exact rounding behaviour is defined by the assembly. */
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmx2)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
00061
/**
 * Generate a static put/avg RV40 quarter-pel motion-compensation function
 * for one (PH, PV) horizontal/vertical subpel position, SIZE x SIZE block.
 *
 * For the bidimensional case (PH && PV) the horizontal filter is run first
 * into an aligned temporary buffer (SIZE + 5 rows, to provide the extra
 * context rows the vertical filter taps need), then the vertical filter
 * produces the final output.  Pure-horizontal and pure-vertical positions
 * call a single filter pass directly.
 *
 * LOOPSIZE (columns processed per assembly call) and HCOFF()/VCOFF()
 * (filter-coefficient table offsets) must be #defined to match the
 * underlying assembly before this macro is used.
 *
 * Fix vs. original: dropped the stray ';' after the closing brace — it
 * produced an extraneous file-scope semicolon at every expansion, which
 * is not valid ISO C (diagnosed with -pedantic).
 */
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
                                                         uint8_t *src,  \
                                                         int stride)    \
{                                                                       \
    int i;                                                              \
    if (PH && PV) {                                                     \
        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
        uint8_t *tmpptr = tmp + SIZE * 2;                               \
        src -= stride * 2;                                              \
                                                                        \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
                                     SIZE + 5, HCOFF(PH));              \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
                                         SIZE, SIZE, VCOFF(PV));        \
    } else if (PV) {                                                    \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, VCOFF(PV));     \
    } else {                                                            \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, HCOFF(PH));     \
    }                                                                   \
}
00097
/* Instantiate both the 8x8 and 16x16 variants for one subpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT)    \
    QPEL_FUNC_DECL(OP, 8,  PH, PV, OPT)     \
    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)

/* Declare the assembly one-dimensional filter primitives for one
 * OP/OPT combination and expand wrapper functions for every subpel
 * position generated here.
 * NOTE(review): positions (0,0), (0,2), (2,0) and (3,3) are deliberately
 * absent from this list; (3,3) gets dedicated *_mc33_mmx inline-asm
 * versions in ff_rv40dsp_init_x86 — presumably the others are served by
 * generic half-pel code elsewhere; verify against the callers. */
#define QPEL_MC_DECL(OP, OPT)                                       \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
                                  const uint8_t *src,               \
                                  ptrdiff_t srcStride,              \
                                  int len, int m);                  \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
                                  const uint8_t *src,               \
                                  ptrdiff_t srcStride,              \
                                  int len, int m);                  \
QPEL_FUNCS_DECL(OP, 0, 1, OPT)                                      \
QPEL_FUNCS_DECL(OP, 0, 3, OPT)                                      \
QPEL_FUNCS_DECL(OP, 1, 0, OPT)                                      \
QPEL_FUNCS_DECL(OP, 1, 1, OPT)                                      \
QPEL_FUNCS_DECL(OP, 1, 2, OPT)                                      \
QPEL_FUNCS_DECL(OP, 1, 3, OPT)                                      \
QPEL_FUNCS_DECL(OP, 2, 1, OPT)                                      \
QPEL_FUNCS_DECL(OP, 2, 2, OPT)                                      \
QPEL_FUNCS_DECL(OP, 2, 3, OPT)                                      \
QPEL_FUNCS_DECL(OP, 3, 0, OPT)                                      \
QPEL_FUNCS_DECL(OP, 3, 1, OPT)                                      \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
00126
/* SSSE3: the assembly handles 8 columns per call; its filter-coefficient
 * tables are laid out 32 bytes apart (offset 32 * (position - 1)). */
#define LOOPSIZE 8
#define HCOFF(x) (32 * (x - 1))
#define VCOFF(x) (32 * (x - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)

#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
/* SSE2: also 8 columns per call, but with a 64-byte coefficient stride. */
#define LOOPSIZE 8
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)

#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
/* MMX (32-bit x86 only): 4 columns per call, 64-byte coefficient stride. */
#define LOOPSIZE 4
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))

QPEL_MC_DECL(put_, _mmx)

/* There are no dedicated MMX2 / 3DNow! "put" primitives: reuse the MMX
 * ones so that only the avg_ wrappers differ per extension. */
#define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmx2)

#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif
00161
/* Install one generated wrapper into the context:
 * [2 - SIZE / 8] selects the 16x16 (index 0) or 8x8 (index 1) table,
 * [4 * PV + PH] indexes the 4x4 grid of quarter-pel positions. */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
    c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;

/* Install both block sizes for one subpel position. */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT)     \
    QPEL_FUNC_SET(OP, 8,  PH, PV, OPT)      \
    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)

/* Install every subpel position generated by QPEL_MC_DECL (same list,
 * same omissions — see QPEL_MC_DECL). */
#define QPEL_MC_SET(OP, OPT)            \
    QPEL_FUNCS_SET (OP, 0, 1, OPT)      \
    QPEL_FUNCS_SET (OP, 0, 3, OPT)      \
    QPEL_FUNCS_SET (OP, 1, 0, OPT)      \
    QPEL_FUNCS_SET (OP, 1, 1, OPT)      \
    QPEL_FUNCS_SET (OP, 1, 2, OPT)      \
    QPEL_FUNCS_SET (OP, 1, 3, OPT)      \
    QPEL_FUNCS_SET (OP, 2, 1, OPT)      \
    QPEL_FUNCS_SET (OP, 2, 2, OPT)      \
    QPEL_FUNCS_SET (OP, 2, 3, OPT)      \
    QPEL_FUNCS_SET (OP, 3, 0, OPT)      \
    QPEL_FUNCS_SET (OP, 3, 1, OPT)      \
    QPEL_FUNCS_SET (OP, 3, 2, OPT)
00186
00188 #endif
00189
/**
 * Initialise the x86 SIMD versions of the RV34/RV40 DSP functions.
 *
 * Function pointers in @p c are overwritten in order of increasing
 * instruction-set capability (MMX -> MMX2/3DNow! -> SSE2 -> SSSE3), so
 * when several CPU flags are set the version installed last wins.
 * @p dsp is currently unused in this function.
 */
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(mm_flags)) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if HAVE_INLINE_ASM
        /* Dedicated inline-asm versions for the (3,3) luma subpel
         * position ([15] == 4 * 3 + 3), which QPEL_MC_SET does not cover.
         * NOTE(review): they are only reachable when HAVE_YASM is also
         * set, because of the enclosing #if — confirm that inline-asm-only
         * builds are intentionally left without them. */
        c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
        c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
        c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
        c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
#endif
#if ARCH_X86_32
        /* The MMX quarter-pel wrappers are only compiled on 32-bit x86. */
        QPEL_MC_SET(put_, _mmx)
#endif
    }
    if (EXTERNAL_MMXEXT(mm_flags)) {
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx2;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx2;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx2;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx2;
#if ARCH_X86_32
        QPEL_MC_SET(avg_, _mmx2)
#endif
    } else if (EXTERNAL_AMD3DNOW(mm_flags)) {
        /* 3DNow! fallback: only taken when MMX2 is not available. */
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
        QPEL_MC_SET(avg_, _3dnow)
#endif
    }
    if (EXTERNAL_SSE2(mm_flags)) {
        /* SSE2 overrides the MMX2 weight functions and, unlike MMX,
         * provides quarter-pel MC on 64-bit builds as well. */
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
        QPEL_MC_SET(put_, _sse2)
        QPEL_MC_SET(avg_, _sse2)
    }
    if (EXTERNAL_SSSE3(mm_flags)) {
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
        QPEL_MC_SET(put_, _ssse3)
        QPEL_MC_SET(avg_, _ssse3)
    }
#endif
}