#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "vp3dsp_sse2.h"

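/* IDCT constants C(1)..C(7): cos(n*pi/16) scaled by 65536 and stored as
 * 16-bit values, each replicated eight times so a single movdqa fills all
 * eight lanes of an xmm register. */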
DECLARE_ALIGNED(16, const uint16_t, ff_vp3_idct_data)[7 * 8] =
{
    64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277,
    60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547,
    54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491,
    46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341,
    36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410,
    25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080,
    12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785
};

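/* One 1-D 8-point VP3 IDCT applied to eight columns in parallel (each xmm
 * register holds the eight 16-bit coefficients of one row).  pmulhw keeps the
 * high 16 bits of each signed product; for constants larger than 0x7FFF the
 * paddw that follows adds the operand back in to compensate for the signed
 * wrap-around of the stored constant.  ADD and SHIFT are hooks for the
 * rounding performed by the second pass. */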
#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \
    "movdqa "I(3)", %%xmm2 \n\t" \
    "movdqa "C(3)", %%xmm6 \n\t" \
    "movdqa %%xmm2, %%xmm4 \n\t" \
    "movdqa "I(5)", %%xmm7 \n\t" \
    "pmulhw %%xmm6, %%xmm4 \n\t" \
    "movdqa "C(5)", %%xmm1 \n\t" \
    "pmulhw %%xmm7, %%xmm6 \n\t" \
    "movdqa %%xmm1, %%xmm5 \n\t" \
    "pmulhw %%xmm2, %%xmm1 \n\t" \
    "movdqa "I(1)", %%xmm3 \n\t" \
    "pmulhw %%xmm7, %%xmm5 \n\t" \
    "movdqa "C(1)", %%xmm0 \n\t" \
    "paddw %%xmm2, %%xmm4 \n\t" \
    "paddw %%xmm7, %%xmm6 \n\t" \
    "paddw %%xmm1, %%xmm2 \n\t" \
    "movdqa "I(7)", %%xmm1 \n\t" \
    "paddw %%xmm5, %%xmm7 \n\t" \
    "movdqa %%xmm0, %%xmm5 \n\t" \
    "pmulhw %%xmm3, %%xmm0 \n\t" \
    "paddsw %%xmm7, %%xmm4 \n\t" \
    "pmulhw %%xmm1, %%xmm5 \n\t" \
    "movdqa "C(7)", %%xmm7 \n\t" \
    "psubsw %%xmm2, %%xmm6 \n\t" \
    "paddw %%xmm3, %%xmm0 \n\t" \
    "pmulhw %%xmm7, %%xmm3 \n\t" \
    "movdqa "I(2)", %%xmm2 \n\t" \
    "pmulhw %%xmm1, %%xmm7 \n\t" \
    "paddw %%xmm1, %%xmm5 \n\t" \
    "movdqa %%xmm2, %%xmm1 \n\t" \
    "pmulhw "C(2)", %%xmm2 \n\t" \
    "psubsw %%xmm5, %%xmm3 \n\t" \
    "movdqa "I(6)", %%xmm5 \n\t" \
    "paddsw %%xmm7, %%xmm0 \n\t" \
    "movdqa %%xmm5, %%xmm7 \n\t" \
    "psubsw %%xmm4, %%xmm0 \n\t" \
    "pmulhw "C(2)", %%xmm5 \n\t" \
    "paddw %%xmm1, %%xmm2 \n\t" \
    "pmulhw "C(6)", %%xmm1 \n\t" \
    "paddsw %%xmm4, %%xmm4 \n\t" \
    "paddsw %%xmm0, %%xmm4 \n\t" \
    "psubsw %%xmm6, %%xmm3 \n\t" \
    "paddw %%xmm7, %%xmm5 \n\t" \
    "paddsw %%xmm6, %%xmm6 \n\t" \
    "pmulhw "C(6)", %%xmm7 \n\t" \
    "paddsw %%xmm3, %%xmm6 \n\t" \
    "movdqa %%xmm4, "I(1)" \n\t" \
    "psubsw %%xmm5, %%xmm1 \n\t" \
    "movdqa "C(4)", %%xmm4 \n\t" \
    "movdqa %%xmm3, %%xmm5 \n\t" \
    "pmulhw %%xmm4, %%xmm3 \n\t" \
    "paddsw %%xmm2, %%xmm7 \n\t" \
    "movdqa %%xmm6, "I(2)" \n\t" \
    "movdqa %%xmm0, %%xmm2 \n\t" \
    "movdqa "I(0)", %%xmm6 \n\t" \
    "pmulhw %%xmm4, %%xmm0 \n\t" \
    "paddw %%xmm3, %%xmm5 \n\t" \
    "movdqa "I(4)", %%xmm3 \n\t" \
    "psubsw %%xmm1, %%xmm5 \n\t" \
    "paddw %%xmm0, %%xmm2 \n\t" \
    "psubsw %%xmm3, %%xmm6 \n\t" \
    "movdqa %%xmm6, %%xmm0 \n\t" \
    "pmulhw %%xmm4, %%xmm6 \n\t" \
    "paddsw %%xmm3, %%xmm3 \n\t" \
    "paddsw %%xmm1, %%xmm1 \n\t" \
    "paddsw %%xmm0, %%xmm3 \n\t" \
    "paddsw %%xmm5, %%xmm1 \n\t" \
    "pmulhw %%xmm3, %%xmm4 \n\t" \
    "paddw %%xmm0, %%xmm6 \n\t" \
    "psubsw %%xmm2, %%xmm6 \n\t" \
    "paddsw %%xmm2, %%xmm2 \n\t" \
    "movdqa "I(1)", %%xmm0 \n\t" \
    "paddsw %%xmm6, %%xmm2 \n\t" \
    "paddw %%xmm3, %%xmm4 \n\t" \
    "psubsw %%xmm1, %%xmm2 \n\t" \
    ADD(%%xmm2) \
    "paddsw %%xmm1, %%xmm1 \n\t" \
    "paddsw %%xmm2, %%xmm1 \n\t" \
    SHIFT(%%xmm2) \
    "psubsw %%xmm7, %%xmm4 \n\t" \
    SHIFT(%%xmm1) \
    "movdqa "I(2)", %%xmm3 \n\t" \
    "paddsw %%xmm7, %%xmm7 \n\t" \
    "paddsw %%xmm4, %%xmm7 \n\t" \
    "psubsw %%xmm3, %%xmm4 \n\t" \
    ADD(%%xmm4) \
    "paddsw %%xmm3, %%xmm3 \n\t" \
    "paddsw %%xmm4, %%xmm3 \n\t" \
    SHIFT(%%xmm4) \
    "psubsw %%xmm5, %%xmm6 \n\t" \
    SHIFT(%%xmm3) \
    ADD(%%xmm6) \
    "paddsw %%xmm5, %%xmm5 \n\t" \
    "paddsw %%xmm6, %%xmm5 \n\t" \
    SHIFT(%%xmm6) \
    SHIFT(%%xmm5) \
    "psubsw %%xmm0, %%xmm7 \n\t" \
    ADD(%%xmm7) \
    "paddsw %%xmm0, %%xmm0 \n\t" \
    "paddsw %%xmm7, %%xmm0 \n\t" \
    SHIFT(%%xmm7) \
    SHIFT(%%xmm0)

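/* Store the eight xmm registers to rows O(0)..O(7) of the output block. */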
#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \
    "movdqa " #r0 ", " O(0) "\n\t" \
    "movdqa " #r1 ", " O(1) "\n\t" \
    "movdqa " #r2 ", " O(2) "\n\t" \
    "movdqa " #r3 ", " O(3) "\n\t" \
    "movdqa " #r4 ", " O(4) "\n\t" \
    "movdqa " #r5 ", " O(5) "\n\t" \
    "movdqa " #r6 ", " O(6) "\n\t" \
    "movdqa " #r7 ", " O(7) "\n\t"

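/* Pass-specific hooks for VP3_1D_IDCT_SSE2: the first pass uses NOP for both,
 * the second pass rounds the result with (x + 8) >> 4 (ff_pw_8 is passed in
 * as operand %2). */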
#define NOP(xmm)
#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t"
#define ADD8(xmm)   "paddsw %2, "#xmm"\n\t"

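/* In-place 2-D IDCT of an 8x8 block of 16-bit coefficients.
 * I(x)/O(x) expand to the byte offset of row x within the block (%0),
 * C(x) to the row of ff_vp3_idct_data holding cos(x*pi/16) (%1). */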
void ff_vp3_idct_sse2(int16_t *input_data)
{
#define I(x) AV_STRINGIFY(16*x)"(%0)"
#define O(x) I(x)
#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"

    __asm__ volatile (
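        /* first pass: 1-D IDCT down the columns (each xmm register holds one
         * row of the block), no rounding */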
        VP3_1D_IDCT_SSE2(NOP, NOP)

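        /* transpose so the second pass can work along the other dimension;
         * the register order passed to PUT_BLOCK follows the row permutation
         * left by TRANSPOSE8 */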
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0))
        PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)

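        /* second pass: 1-D IDCT across the rows, rounding the final values
         * with (x + 8) >> 4 via ADD8/SHIFT4 */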
        VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
        PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
    );
}

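/* IDCT the block and write it out as pixels: put_signed_pixels_clamped_mmx
 * adds the +128 bias and clamps each value to the 0..255 range. */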
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}

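/* IDCT the block and add it to the existing pixels in dest, with clamping. */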
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}