[FFmpeg-cvslog] r22565 - in trunk: configure libavcodec/Makefile libavcodec/arm/dsputil_init_neon.c libavcodec/arm/h264dsp_init_arm.c libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/h264.c libavcodec/h264.h l...

Tue Mar 16 02:17:00 CET 2010

Author: mru
Date: Tue Mar 16 02:17:00 2010
New Revision: 22565

Log:
Move H264 dsputil functions into their own struct

This moves the H264-specific functions from DSPContext to the new
H264DSPContext.  The code is made conditional on CONFIG_H264DSP
which is set by the codecs requiring it.

The qpel and chroma MC functions are not moved as these are used by
non-h264 code.

Added:
   trunk/libavcodec/arm/h264dsp_init_arm.c
   trunk/libavcodec/h264dsp.c
      - copied, changed from r22564, trunk/libavcodec/dsputil.c
   trunk/libavcodec/h264dsp.h
      - copied, changed from r22564, trunk/libavcodec/dsputil.h
Modified:
   trunk/configure
   trunk/libavcodec/Makefile
   trunk/libavcodec/arm/dsputil_init_neon.c
   trunk/libavcodec/dsputil.c
   trunk/libavcodec/dsputil.h
   trunk/libavcodec/h264.c
   trunk/libavcodec/h264.h
   trunk/libavcodec/h264_loopfilter.c
   trunk/libavcodec/ppc/h264_altivec.c
   trunk/libavcodec/x86/dsputil_mmx.c

Modified: trunk/configure
==============================================================================

--- trunk/configure	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/configure	Tue Mar 16 02:17:00 2010	(r22565)
@@ -907,6 +907,7 @@ CONFIG_LIST="
     gpl
     gprof
     gray
+    h264dsp
     hardcoded_tables
     libdc1394
     libdirac
@@ -1229,7 +1230,7 @@ h263_vaapi_hwaccel_deps="va_va_h"
 h263_vaapi_hwaccel_select="vaapi h263_decoder"
 h263i_decoder_select="h263_decoder"
 h263p_encoder_select="h263_encoder"
-h264_decoder_select="golomb"
+h264_decoder_select="golomb h264dsp"
 h264_dxva2_hwaccel_deps="dxva2api_h"
 h264_dxva2_hwaccel_select="dxva2 h264_decoder"
 h264_vaapi_hwaccel_deps="va_va_h"
@@ -1275,8 +1276,8 @@ rv10_decoder_select="h263_decoder"
 rv10_encoder_select="h263_encoder"
 rv20_decoder_select="h263_decoder"
 rv20_encoder_select="h263_encoder"
-rv30_decoder_select="golomb"
-rv40_decoder_select="golomb"
+rv30_decoder_select="golomb h264dsp"
+rv40_decoder_select="golomb h264dsp"
 shorten_decoder_select="golomb"
 sipr_decoder_select="lsp"
 snow_decoder_select="dwt"
@@ -1285,7 +1286,7 @@ sonic_decoder_select="golomb"
 sonic_encoder_select="golomb"
 sonic_ls_encoder_select="golomb"
 svq1_encoder_select="aandct"
-svq3_decoder_select="golomb"
+svq3_decoder_select="golomb h264dsp"
 svq3_decoder_suggest="zlib"
 theora_decoder_select="vp3_decoder"
 tiff_decoder_suggest="zlib"
@@ -1324,7 +1325,7 @@ zmbv_decoder_select="zlib"
 zmbv_encoder_select="zlib"
 
 # parsers
-h264_parser_select="golomb"
+h264_parser_select="golomb h264dsp"
 
 # bitstream_filters
 aac_adtstoasc_bsf_select="aac_parser"

Modified: trunk/libavcodec/Makefile
==============================================================================
--- trunk/libavcodec/Makefile	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/Makefile	Tue Mar 16 02:17:00 2010	(r22565)
@@ -33,6 +33,7 @@ OBJS-$(CONFIG_DXVA2)                   +
 FFT-OBJS-$(CONFIG_HARDCODED_TABLES)    += cos_tables.o
 OBJS-$(CONFIG_FFT)                     += avfft.o fft.o $(FFT-OBJS-yes)
 OBJS-$(CONFIG_GOLOMB)                  += golomb.o
+OBJS-$(CONFIG_H264DSP)                 += h264dsp.o h264idct.o h264pred.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_MDCT)                    += mdct.o
@@ -146,7 +147,7 @@ OBJS-$(CONFIG_H263_ENCODER)            +
                                           ratecontrol.o h263.o ituh263enc.o \
                                           flvenc.o mpeg12data.o             \
                                           mpegvideo.o error_resilience.o
-OBJS-$(CONFIG_H264_DECODER)            += h264.o h264idct.o h264pred.o         \
+OBJS-$(CONFIG_H264_DECODER)            += h264.o                               \
                                           h264_loopfilter.o h264_direct.o      \
                                           cabac.o h264_sei.o h264_ps.o         \
                                           h264_refs.o h264_cavlc.o h264_cabac.o\
@@ -282,9 +283,9 @@ OBJS-$(CONFIG_RV10_DECODER)            +
 OBJS-$(CONFIG_RV10_ENCODER)            += rv10enc.o
 OBJS-$(CONFIG_RV20_DECODER)            += rv10.o
 OBJS-$(CONFIG_RV20_ENCODER)            += rv20enc.o
-OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o h264pred.o rv30dsp.o \
+OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o rv30dsp.o        \
                                           mpegvideo.o error_resilience.o
-OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o h264pred.o rv40dsp.o \
+OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o rv40dsp.o        \
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_SGI_DECODER)             += sgidec.o
 OBJS-$(CONFIG_SGI_ENCODER)             += sgienc.o rle.o
@@ -315,7 +316,7 @@ OBJS-$(CONFIG_SVQ1_ENCODER)            +
                                           mpegvideo.o error_resilience.o \
                                           ituh263enc.o mpegvideo_enc.o   \
                                           ratecontrol.o mpeg12data.o
-OBJS-$(CONFIG_SVQ3_DECODER)            += h264.o svq3.o h264idct.o h264pred.o \
+OBJS-$(CONFIG_SVQ3_DECODER)            += h264.o svq3.o                       \
                                           h264_loopfilter.o h264_direct.o     \
                                           h264_sei.o h264_ps.o h264_refs.o    \
                                           h264_cavlc.o h264_cabac.o cabac.o   \
@@ -538,8 +539,8 @@ OBJS-$(CONFIG_DVBSUB_PARSER)           +
 OBJS-$(CONFIG_DVDSUB_PARSER)           += dvdsub_parser.o
 OBJS-$(CONFIG_H261_PARSER)             += h261_parser.o
 OBJS-$(CONFIG_H263_PARSER)             += h263_parser.o
-OBJS-$(CONFIG_H264_PARSER)             += h264_parser.o h264.o h264idct.o \
-                                          h264pred.o cabac.o              \
+OBJS-$(CONFIG_H264_PARSER)             += h264_parser.o h264.o            \
+                                          cabac.o                         \
                                           h264_refs.o h264_sei.o h264_direct.o \
                                           h264_loopfilter.o h264_cabac.o \
                                           h264_cavlc.o h264_ps.o \
@@ -631,13 +632,16 @@ OBJS-$(ARCH_ALPHA)                     +
                                           alpha/mpegvideo_alpha.o       \
                                           alpha/simple_idct_alpha.o     \
 
+ARM-OBJS-$(CONFIG_H264DSP)             += arm/h264dsp_init_arm.o        \
+                                          arm/h264pred_init_arm.o       \
+
 OBJS-$(ARCH_ARM)                       += arm/dsputil_init_arm.o        \
                                           arm/dsputil_arm.o             \
                                           arm/fft_init_arm.o            \
-                                          arm/h264pred_init_arm.o       \
                                           arm/jrevdct_arm.o             \
                                           arm/mpegvideo_arm.o           \
                                           arm/simple_idct_arm.o         \
+                                          $(ARM-OBJS-yes)
 
 OBJS-$(HAVE_ARMV5TE)                   += arm/dsputil_init_armv5te.o    \
                                           arm/mpegvideo_armv5te.o       \
@@ -658,7 +662,7 @@ NEON-OBJS-$(CONFIG_FFT)                +
 
 NEON-OBJS-$(CONFIG_MDCT)               += arm/mdct_neon.o               \
 
-NEON-OBJS-$(CONFIG_H264_DECODER)       += arm/h264dsp_neon.o            \
+NEON-OBJS-$(CONFIG_H264DSP)            += arm/h264dsp_neon.o            \
                                           arm/h264idct_neon.o           \
                                           arm/h264pred_neon.o           \
 
@@ -680,7 +684,7 @@ OBJS-$(ARCH_BFIN)                      +
 
 OBJS-$(ARCH_PPC)                       += ppc/dsputil_ppc.o             \
 
-ALTIVEC-OBJS-$(CONFIG_H264_DECODER)    += ppc/h264_altivec.o
+ALTIVEC-OBJS-$(CONFIG_H264DSP)         += ppc/h264_altivec.o
 ALTIVEC-OBJS-$(CONFIG_VC1_DECODER)     += ppc/vc1dsp_altivec.o
 ALTIVEC-OBJS-$(CONFIG_VP3_DECODER)     += ppc/vp3dsp_altivec.o
 ALTIVEC-OBJS-$(CONFIG_VP5_DECODER)     += ppc/vp3dsp_altivec.o

Modified: trunk/libavcodec/arm/dsputil_init_neon.c
==============================================================================
--- trunk/libavcodec/arm/dsputil_init_neon.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/arm/dsputil_init_neon.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -131,69 +131,6 @@ void ff_avg_h264_chroma_mc8_neon(uint8_t
 void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
 
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
-                                     int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
-                                     int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
-                                       int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
-                                       int beta, int8_t *tc0);
-
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
-                                      int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
-                                        int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-
-void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
-                             DCTELEM *block, int stride,
-                             const uint8_t nnzc[6*8]);
-void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
-                                  DCTELEM *block, int stride,
-                                  const uint8_t nnzc[6*8]);
-void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
-                            DCTELEM *block, int stride,
-                            const uint8_t nnzc[6*8]);
-
 void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
 void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
 
@@ -352,35 +289,6 @@ void ff_dsputil_init_neon(DSPContext *c,
         c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
         c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
-
-        c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
-        c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
-        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
-        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
-
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
-        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
-        c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
-        c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
-        c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
-
-        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
-        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
-        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
-        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
-        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
-        c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
-        c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
-        c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
-
-        c->h264_idct_add        = ff_h264_idct_add_neon;
-        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
-        c->h264_idct_add16      = ff_h264_idct_add16_neon;
-        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
-        c->h264_idct_add8       = ff_h264_idct_add8_neon;
     }
 
     if (CONFIG_VP3_DECODER) {

Added: trunk/libavcodec/arm/h264dsp_init_arm.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ trunk/libavcodec/arm/h264dsp_init_arm.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264dsp.h"
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+
+void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
+                                      int weight, int offset);
+void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
+                                     int weight, int offset);
+void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
+                                     int weight, int offset);
+void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
+                                    int weight, int offset);
+
+void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                        int log2_den, int weightd, int weights,
+                                        int offset);
+void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+
+void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+                             DCTELEM *block, int stride,
+                             const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+                                  DCTELEM *block, int stride,
+                                  const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+                            DCTELEM *block, int stride,
+                            const uint8_t nnzc[6*8]);
+
+#if HAVE_NEON
+static void ff_h264dsp_init_neon(H264DSPContext *c)
+{
+    c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
+    c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
+    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
+    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
+    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
+    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
+    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
+
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
+    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
+    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
+    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
+    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
+    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+
+    c->h264_idct_add        = ff_h264_idct_add_neon;
+    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
+    c->h264_idct_add16      = ff_h264_idct_add16_neon;
+    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+    c->h264_idct_add8       = ff_h264_idct_add8_neon;
+}
+#endif
+
+void ff_h264dsp_init_arm(H264DSPContext *c)
+{
+    if (HAVE_NEON) ff_h264dsp_init_neon(c);
+}

Modified: trunk/libavcodec/dsputil.c
==============================================================================
--- trunk/libavcodec/dsputil.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/dsputil.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -2597,76 +2597,6 @@ H264_MC(avg_, 16)
 #undef op2_put
 #endif
 
-#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
-#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    int y; \
-    offset <<= log2_denom; \
-    if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
-        op_scale1(0); \
-        op_scale1(1); \
-        if(W==2) continue; \
-        op_scale1(2); \
-        op_scale1(3); \
-        if(W==4) continue; \
-        op_scale1(4); \
-        op_scale1(5); \
-        op_scale1(6); \
-        op_scale1(7); \
-        if(W==8) continue; \
-        op_scale1(8); \
-        op_scale1(9); \
-        op_scale1(10); \
-        op_scale1(11); \
-        op_scale1(12); \
-        op_scale1(13); \
-        op_scale1(14); \
-        op_scale1(15); \
-    } \
-} \
-static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    int y; \
-    offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
-        op_scale2(0); \
-        op_scale2(1); \
-        if(W==2) continue; \
-        op_scale2(2); \
-        op_scale2(3); \
-        if(W==4) continue; \
-        op_scale2(4); \
-        op_scale2(5); \
-        op_scale2(6); \
-        op_scale2(7); \
-        if(W==8) continue; \
-        op_scale2(8); \
-        op_scale2(9); \
-        op_scale2(10); \
-        op_scale2(11); \
-        op_scale2(12); \
-        op_scale2(13); \
-        op_scale2(14); \
-        op_scale2(15); \
-    } \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
-
-#undef op_scale1
-#undef op_scale2
-#undef H264_WEIGHT
-
 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
     int i;
@@ -2711,9 +2641,6 @@ void ff_avg_vc1_mspel_mc00_c(uint8_t *ds
 }
 #endif /* CONFIG_VC1_DECODER */
 
-/* H264 specific */
-void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
-
 #if CONFIG_RV40_DECODER
 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
     put_pixels16_xy2_c(dst, src, stride, 16);
@@ -2907,179 +2834,6 @@ static void h261_loop_filter_c(uint8_t *
     }
 }
 
-static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        if( tc0[i] < 0 ) {
-            pix += 4*ystride;
-            continue;
-        }
-        for( d = 0; d < 4; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int p2 = pix[-3*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-            const int q2 = pix[2*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int tc = tc0[i];
-                int i_delta;
-
-                if( FFABS( p2 - p0 ) < beta ) {
-                    if(tc0[i])
-                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-                if( FFABS( q2 - q0 ) < beta ) {
-                    if(tc0[i])
-                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
-                    tc++;
-                }
-
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 16; d++ ) {
-        const int p2 = pix[-3*xstride];
-        const int p1 = pix[-2*xstride];
-        const int p0 = pix[-1*xstride];
-
-        const int q0 = pix[ 0*xstride];
-        const int q1 = pix[ 1*xstride];
-        const int q2 = pix[ 2*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
-                if( FFABS( p2 - p0 ) < beta)
-                {
-                    const int p3 = pix[-4*xstride];
-                    /* p0', p1', p2' */
-                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-                } else {
-                    /* p0' */
-                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                }
-                if( FFABS( q2 - q0 ) < beta)
-                {
-                    const int q3 = pix[3*xstride];
-                    /* q0', q1', q2' */
-                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
-                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-                } else {
-                    /* q0' */
-                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-                }
-            }else{
-                /* p0', q0' */
-                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-            }
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
-}
-
-static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
-{
-    int i, d;
-    for( i = 0; i < 4; i++ ) {
-        const int tc = tc0[i];
-        if( tc <= 0 ) {
-            pix += 2*ystride;
-            continue;
-        }
-        for( d = 0; d < 2; d++ ) {
-            const int p0 = pix[-1*xstride];
-            const int p1 = pix[-2*xstride];
-            const int q0 = pix[0];
-            const int q1 = pix[1*xstride];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-
-                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
-                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
-            }
-            pix += ystride;
-        }
-    }
-}
-static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
-}
-static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
-}
-
-static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
-{
-    int d;
-    for( d = 0; d < 8; d++ ) {
-        const int p0 = pix[-1*xstride];
-        const int p1 = pix[-2*xstride];
-        const int q0 = pix[0];
-        const int q1 = pix[1*xstride];
-
-        if( FFABS( p0 - q0 ) < alpha &&
-            FFABS( p1 - p0 ) < beta &&
-            FFABS( q1 - q0 ) < beta ) {
-
-            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
-            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
-        }
-        pix += ystride;
-    }
-}
-static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
-}
-static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
-}
-
 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int s, i;
@@ -4502,17 +4256,6 @@ av_cold void dsputil_init(DSPContext* c,
         }
     }
 
-    if (CONFIG_H264_DECODER) {
-        c->h264_idct_add= ff_h264_idct_add_c;
-        c->h264_idct8_add= ff_h264_idct8_add_c;
-        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
-        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
-        c->h264_idct_add16     = ff_h264_idct_add16_c;
-        c->h264_idct8_add4     = ff_h264_idct8_add4_c;
-        c->h264_idct_add8      = ff_h264_idct_add8_c;
-        c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
-    }
-
     c->get_pixels = get_pixels_c;
     c->diff_pixels = diff_pixels_c;
     c->put_pixels_clamped = put_pixels_clamped_c;
@@ -4635,27 +4378,6 @@ av_cold void dsputil_init(DSPContext* c,
     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
 
-    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
-    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
-    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
-    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
-    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
-    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
-    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
-    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
-    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
-    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
-    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
-    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
-    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
-    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
-    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
-    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
-    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
-    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
-    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
-    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
-
     c->draw_edges = draw_edges_c;
 
 #if CONFIG_CAVS_DECODER
@@ -4737,16 +4459,6 @@ av_cold void dsputil_init(DSPContext* c,
     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
 #endif
 
-    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
-    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
-    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
-    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
-    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
-    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
-    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
-    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
-    c->h264_loop_filter_strength= NULL;
-
     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
         c->h263_h_loop_filter= h263_h_loop_filter_c;
         c->h263_v_loop_filter= h263_v_loop_filter_c;

Modified: trunk/libavcodec/dsputil.h
==============================================================================
--- trunk/libavcodec/dsputil.h	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/dsputil.h	Tue Mar 16 02:17:00 2010	(r22565)
@@ -149,8 +149,6 @@ typedef void (*op_pixels_func)(uint8_t *
 typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
 typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
 
 typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
 
@@ -340,9 +338,6 @@ typedef struct DSPContext {
     qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
     qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
 
-    h264_weight_func weight_h264_pixels_tab[10];
-    h264_biweight_func biweight_h264_pixels_tab[10];
-
     /* AVS specific */
     qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
     qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
@@ -370,19 +365,6 @@ typedef struct DSPContext {
     void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
     void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
 
-    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
-    /* v/h_loop_filter_luma_intra: align 16 */
-    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
-    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
-    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
-    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c
-    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
-                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
-
     void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
 
@@ -517,21 +499,6 @@ typedef struct DSPContext {
     void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
 #define EDGE_WIDTH 16
 
-    /* h264 functions */
-    /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
-       NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
-        The reason for above, is that no 2 out of one list may use a different permutation.
-    */
-    void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
-    void (*h264_dct)(DCTELEM block[4][4]);
-    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-
     void (*prefetch)(void *mem, int stride, int h);
 
     void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

Modified: trunk/libavcodec/h264.c
==============================================================================
--- trunk/libavcodec/h264.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/h264.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -678,7 +678,7 @@ static void free_tables(H264Context *h){
 
 static void init_dequant8_coeff_table(H264Context *h){
     int i,q,x;
-    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
+    const int transpose = (h->h264dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
     h->dequant8_coeff[0] = h->dequant8_buffer[0];
     h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
@@ -701,7 +701,7 @@ static void init_dequant8_coeff_table(H2
 
 static void init_dequant4_coeff_table(H264Context *h){
     int i,j,q,x;
-    const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
+    const int transpose = (h->h264dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
     for(i=0; i<6; i++ ){
         h->dequant4_coeff[i] = h->dequant4_buffer[i];
         for(j=0; j<i; j++){
@@ -831,6 +831,7 @@ static av_cold void common_init(H264Cont
     s->height = s->avctx->height;
     s->codec_id= s->avctx->codec->id;
 
+    ff_h264dsp_init(&h->h264dsp);
     ff_h264_pred_init(&h->hpc, s->codec_id);
 
     h->dequant_coeff_pps= -1;
@@ -1159,8 +1160,8 @@ static av_always_inline void hl_decode_m
                             idct_dc_add =
                             idct_add    = s->dsp.add_pixels8;
                         }else{
-                            idct_dc_add = s->dsp.h264_idct8_dc_add;
-                            idct_add    = s->dsp.h264_idct8_add;
+                            idct_dc_add = h->h264dsp.h264_idct8_dc_add;
+                            idct_add    = h->h264dsp.h264_idct8_add;
                         }
                         for(i=0; i<16; i+=4){
                             uint8_t * const ptr= dest_y + block_offset[i];
@@ -1184,8 +1185,8 @@ static av_always_inline void hl_decode_m
                             idct_dc_add =
                             idct_add    = s->dsp.add_pixels4;
                         }else{
-                            idct_dc_add = s->dsp.h264_idct_dc_add;
-                            idct_add    = s->dsp.h264_idct_add;
+                            idct_dc_add = h->h264dsp.h264_idct_dc_add;
+                            idct_add    = h->h264dsp.h264_idct_add;
                         }
                         for(i=0; i<16; i++){
                             uint8_t * const ptr= dest_y + block_offset[i];
@@ -1236,7 +1237,7 @@ static av_always_inline void hl_decode_m
             hl_motion(h, dest_y, dest_cb, dest_cr,
                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
+                      h->h264dsp.weight_h264_pixels_tab, h->h264dsp.biweight_h264_pixels_tab);
         }
 
 
@@ -1253,7 +1254,7 @@ static av_always_inline void hl_decode_m
                             }
                         }
                     }else{
-                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                         h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                     }
                 }else if(h->cbp&15){
                     if(transform_bypass){
@@ -1266,9 +1267,9 @@ static av_always_inline void hl_decode_m
                         }
                     }else{
                         if(IS_8x8DCT(mb_type)){
-                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                            h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                         }else{
-                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                            h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                         }
                     }
                 }
@@ -1299,8 +1300,8 @@ static av_always_inline void hl_decode_m
                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                 if(is_h264){
-                    idct_add = s->dsp.h264_idct_add;
-                    idct_dc_add = s->dsp.h264_idct_dc_add;
+                    idct_add = h->h264dsp.h264_idct_add;
+                    idct_dc_add = h->h264dsp.h264_idct_dc_add;
                     for(i=16; i<16+8; i++){
                         if(h->non_zero_count_cache[ scan8[i] ])
                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
@@ -1560,7 +1561,7 @@ static int init_poc(H264Context *h){
 static void init_scan_tables(H264Context *h){
     MpegEncContext * const s = &h->s;
     int i;
-    if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
+    if(h->h264dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
     }else{
@@ -1571,7 +1572,7 @@ static void init_scan_tables(H264Context
 #undef T
         }
     }
-    if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
+    if(h->h264dsp.h264_idct8_add == ff_h264_idct8_add_c){
         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
@@ -3003,7 +3004,7 @@ int main(void){
         }
 //        printf("\n");
 
-        s->dsp.h264_idct_add(ref, block, 4);
+        h->h264dsp.h264_idct_add(ref, block, 4);
 /*        for(j=0; j<16; j++){
             printf("%d ", ref[j]);
         }

Modified: trunk/libavcodec/h264.h
==============================================================================
--- trunk/libavcodec/h264.h	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/h264.h	Tue Mar 16 02:17:00 2010	(r22565)
@@ -32,6 +32,7 @@
 #include "dsputil.h"
 #include "cabac.h"
 #include "mpegvideo.h"
+#include "h264dsp.h"
 #include "h264pred.h"
 #include "rectangle.h"
 
@@ -262,6 +263,7 @@ typedef struct MMCO{
  */
 typedef struct H264Context{
     MpegEncContext s;
+    H264DSPContext h264dsp;
     int chroma_qp[2]; //QPc
 
     int qp_thresh;      ///< QP threshold to skip loopfilter

Modified: trunk/libavcodec/h264_loopfilter.c
==============================================================================
--- trunk/libavcodec/h264_loopfilter.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/h264_loopfilter.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -112,9 +112,9 @@ static void av_always_inline filter_mb_e
         tc[1] = tc0_table[index_a][bS[1]];
         tc[2] = tc0_table[index_a][bS[2]];
         tc[3] = tc0_table[index_a][bS[3]];
-        h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
+        h->h264dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
     } else {
-        h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
+        h->h264dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
     }
 }
 static void av_always_inline filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
@@ -129,9 +129,9 @@ static void av_always_inline filter_mb_e
         tc[1] = tc0_table[index_a][bS[1]]+1;
         tc[2] = tc0_table[index_a][bS[2]]+1;
         tc[3] = tc0_table[index_a][bS[3]]+1;
-        h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
+        h->h264dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
     } else {
-        h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        h->h264dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
     }
 }
 
@@ -282,9 +282,9 @@ static void av_always_inline filter_mb_e
         tc[1] = tc0_table[index_a][bS[1]];
         tc[2] = tc0_table[index_a][bS[2]];
         tc[3] = tc0_table[index_a][bS[3]];
-        h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
+        h->h264dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
     } else {
-        h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
+        h->h264dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
     }
 }
 
@@ -300,9 +300,9 @@ static void av_always_inline filter_mb_e
         tc[1] = tc0_table[index_a][bS[1]]+1;
         tc[2] = tc0_table[index_a][bS[2]]+1;
         tc[3] = tc0_table[index_a][bS[3]]+1;
-        h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
+        h->h264dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
     } else {
-        h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
+        h->h264dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
     }
 }
 
@@ -314,7 +314,7 @@ void ff_h264_filter_mb_fast( H264Context
 
     mb_xy = h->mb_xy;
 
-    if(!h->top_type || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
+    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
         ff_h264_filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
     }
@@ -381,7 +381,7 @@ void ff_h264_filter_mb_fast( H264Context
             int mask_edge0 = 3*((mask_edge1>>1) & ((5*left_type)>>5)&1); // (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) && (h->left_type[0] & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 : 0;
             int step =  1+(mb_type>>24); //IS_8x8DCT(mb_type) ? 2 : 1;
             edges = 4 - 3*((mb_type>>3) & !(h->cbp & 15)); //(mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
-            s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
+            h->h264dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
                                               h->list_count==2, edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
         }
         if( IS_INTRA(left_type) )

Copied and modified: trunk/libavcodec/h264dsp.c (from r22564, trunk/libavcodec/dsputil.c)
==============================================================================
--- trunk/libavcodec/dsputil.c	Tue Mar 16 00:40:51 2010	(r22564, copy source)
+++ trunk/libavcodec/h264dsp.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -1,9 +1,6 @@
 /*
- * DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
- *
- * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni at gmx.at>
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni at gmx.at>
  *
  * This file is part of FFmpeg.
  *
@@ -23,2579 +20,14 @@
  */
 
 /**
- * @file libavcodec/dsputil.c
- * DSP utils
+ * @file libavcodec/h264dsp.c
+ * H.264 / AVC / MPEG4 part10 DSP functions.
+ * @author Michael Niedermayer <michaelni at gmx.at>
  */
 
+#include <stdint.h>
 #include "avcodec.h"
-#include "dsputil.h"
-#include "simple_idct.h"
-#include "faandct.h"
-#include "faanidct.h"
-#include "mathops.h"
-#include "mpegvideo.h"
-#include "config.h"
-#include "lpc.h"
-#include "ac3dec.h"
-#include "vorbis.h"
-#include "png.h"
-
-uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
-uint32_t ff_squareTbl[512] = {0, };
-
-// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
-#define pb_7f (~0UL/255 * 0x7f)
-#define pb_80 (~0UL/255 * 0x80)
-
-const uint8_t ff_zigzag_direct[64] = {
-    0,   1,  8, 16,  9,  2,  3, 10,
-    17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34,
-    27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46,
-    53, 60, 61, 54, 47, 55, 62, 63
-};
-
-/* Specific zigzag scan for 248 idct. NOTE that unlike the
-   specification, we interleave the fields */
-const uint8_t ff_zigzag248_direct[64] = {
-     0,  8,  1,  9, 16, 24,  2, 10,
-    17, 25, 32, 40, 48, 56, 33, 41,
-    18, 26,  3, 11,  4, 12, 19, 27,
-    34, 42, 49, 57, 50, 58, 35, 43,
-    20, 28,  5, 13,  6, 14, 21, 29,
-    36, 44, 51, 59, 52, 60, 37, 45,
-    22, 30,  7, 15, 23, 31, 38, 46,
-    53, 61, 54, 62, 39, 47, 55, 63,
-};
-
-/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
-DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
-
-const uint8_t ff_alternate_horizontal_scan[64] = {
-    0,  1,   2,  3,  8,  9, 16, 17,
-    10, 11,  4,  5,  6,  7, 15, 14,
-    13, 12, 19, 18, 24, 25, 32, 33,
-    26, 27, 20, 21, 22, 23, 28, 29,
-    30, 31, 34, 35, 40, 41, 48, 49,
-    42, 43, 36, 37, 38, 39, 44, 45,
-    46, 47, 50, 51, 56, 57, 58, 59,
-    52, 53, 54, 55, 60, 61, 62, 63,
-};
-
-const uint8_t ff_alternate_vertical_scan[64] = {
-    0,  8,  16, 24,  1,  9,  2, 10,
-    17, 25, 32, 40, 48, 56, 57, 49,
-    41, 33, 26, 18,  3, 11,  4, 12,
-    19, 27, 34, 42, 50, 58, 35, 43,
-    51, 59, 20, 28,  5, 13,  6, 14,
-    21, 29, 36, 44, 52, 60, 37, 45,
-    53, 61, 22, 30,  7, 15, 23, 31,
-    38, 46, 54, 62, 39, 47, 55, 63,
-};
-
-/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
- * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
-const uint32_t ff_inverse[257]={
-         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
- 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
- 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
- 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
- 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
- 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
-  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
-  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
-  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
-  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
-  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
-  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
-  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
-  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
-  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
-  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
-  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
-  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
-  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
-  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
-  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
-  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
-  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
-  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
-  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
-  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
-  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
-  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
-  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
-  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
-  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
-  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
-  16777216
-};
-
-/* Input permutation for the simple_idct_mmx */
-static const uint8_t simple_mmx_permutation[64]={
-        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
-        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
-        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
-        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
-        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
-        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
-        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
-        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
-};
-
-static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-
-void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
-    int i;
-    int end;
-
-    st->scantable= src_scantable;
-
-    for(i=0; i<64; i++){
-        int j;
-        j = src_scantable[i];
-        st->permutated[i] = permutation[j];
-#if ARCH_PPC
-        st->inverse[j] = i;
-#endif
-    }
-
-    end=-1;
-    for(i=0; i<64; i++){
-        int j;
-        j = st->permutated[i];
-        if(j>end) end=j;
-        st->raster_end[i]= end;
-    }
-}
-
-static int pix_sum_c(uint8_t * pix, int line_size)
-{
-    int s, i, j;
-
-    s = 0;
-    for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j += 8) {
-            s += pix[0];
-            s += pix[1];
-            s += pix[2];
-            s += pix[3];
-            s += pix[4];
-            s += pix[5];
-            s += pix[6];
-            s += pix[7];
-            pix += 8;
-        }
-        pix += line_size - 16;
-    }
-    return s;
-}
-
-static int pix_norm1_c(uint8_t * pix, int line_size)
-{
-    int s, i, j;
-    uint32_t *sq = ff_squareTbl + 256;
-
-    s = 0;
-    for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j += 8) {
-#if 0
-            s += sq[pix[0]];
-            s += sq[pix[1]];
-            s += sq[pix[2]];
-            s += sq[pix[3]];
-            s += sq[pix[4]];
-            s += sq[pix[5]];
-            s += sq[pix[6]];
-            s += sq[pix[7]];
-#else
-#if LONG_MAX > 2147483647
-            register uint64_t x=*(uint64_t*)pix;
-            s += sq[x&0xff];
-            s += sq[(x>>8)&0xff];
-            s += sq[(x>>16)&0xff];
-            s += sq[(x>>24)&0xff];
-            s += sq[(x>>32)&0xff];
-            s += sq[(x>>40)&0xff];
-            s += sq[(x>>48)&0xff];
-            s += sq[(x>>56)&0xff];
-#else
-            register uint32_t x=*(uint32_t*)pix;
-            s += sq[x&0xff];
-            s += sq[(x>>8)&0xff];
-            s += sq[(x>>16)&0xff];
-            s += sq[(x>>24)&0xff];
-            x=*(uint32_t*)(pix+4);
-            s += sq[x&0xff];
-            s += sq[(x>>8)&0xff];
-            s += sq[(x>>16)&0xff];
-            s += sq[(x>>24)&0xff];
-#endif
-#endif
-            pix += 8;
-        }
-        pix += line_size - 16;
-    }
-    return s;
-}
-
-static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
-    int i;
-
-    for(i=0; i+8<=w; i+=8){
-        dst[i+0]= bswap_32(src[i+0]);
-        dst[i+1]= bswap_32(src[i+1]);
-        dst[i+2]= bswap_32(src[i+2]);
-        dst[i+3]= bswap_32(src[i+3]);
-        dst[i+4]= bswap_32(src[i+4]);
-        dst[i+5]= bswap_32(src[i+5]);
-        dst[i+6]= bswap_32(src[i+6]);
-        dst[i+7]= bswap_32(src[i+7]);
-    }
-    for(;i<w; i++){
-        dst[i+0]= bswap_32(src[i+0]);
-    }
-}
-
-static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
-{
-    int s, i;
-    uint32_t *sq = ff_squareTbl + 256;
-
-    s = 0;
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[0] - pix2[0]];
-        s += sq[pix1[1] - pix2[1]];
-        s += sq[pix1[2] - pix2[2]];
-        s += sq[pix1[3] - pix2[3]];
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
-{
-    int s, i;
-    uint32_t *sq = ff_squareTbl + 256;
-
-    s = 0;
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[0] - pix2[0]];
-        s += sq[pix1[1] - pix2[1]];
-        s += sq[pix1[2] - pix2[2]];
-        s += sq[pix1[3] - pix2[3]];
-        s += sq[pix1[4] - pix2[4]];
-        s += sq[pix1[5] - pix2[5]];
-        s += sq[pix1[6] - pix2[6]];
-        s += sq[pix1[7] - pix2[7]];
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-    uint32_t *sq = ff_squareTbl + 256;
-
-    s = 0;
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[ 0] - pix2[ 0]];
-        s += sq[pix1[ 1] - pix2[ 1]];
-        s += sq[pix1[ 2] - pix2[ 2]];
-        s += sq[pix1[ 3] - pix2[ 3]];
-        s += sq[pix1[ 4] - pix2[ 4]];
-        s += sq[pix1[ 5] - pix2[ 5]];
-        s += sq[pix1[ 6] - pix2[ 6]];
-        s += sq[pix1[ 7] - pix2[ 7]];
-        s += sq[pix1[ 8] - pix2[ 8]];
-        s += sq[pix1[ 9] - pix2[ 9]];
-        s += sq[pix1[10] - pix2[10]];
-        s += sq[pix1[11] - pix2[11]];
-        s += sq[pix1[12] - pix2[12]];
-        s += sq[pix1[13] - pix2[13]];
-        s += sq[pix1[14] - pix2[14]];
-        s += sq[pix1[15] - pix2[15]];
-
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-/* draw the edges of width 'w' of an image of size width, height */
-//FIXME check that this is ok for mpeg4 interlaced
-static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
-{
-    uint8_t *ptr, *last_line;
-    int i;
-
-    last_line = buf + (height - 1) * wrap;
-    for(i=0;i<w;i++) {
-        /* top and bottom */
-        memcpy(buf - (i + 1) * wrap, buf, width);
-        memcpy(last_line + (i + 1) * wrap, last_line, width);
-    }
-    /* left and right */
-    ptr = buf;
-    for(i=0;i<height;i++) {
-        memset(ptr - w, ptr[0], w);
-        memset(ptr + width, ptr[width-1], w);
-        ptr += wrap;
-    }
-    /* corners */
-    for(i=0;i<w;i++) {
-        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
-        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
-        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
-        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
-    }
-}
-
-/**
- * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
- * @param buf destination buffer
- * @param src source buffer
- * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
- * @param block_w width of block
- * @param block_h height of block
- * @param src_x x coordinate of the top left sample of the block in the source buffer
- * @param src_y y coordinate of the top left sample of the block in the source buffer
- * @param w width of the source buffer
- * @param h height of the source buffer
- */
-void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
-                                    int src_x, int src_y, int w, int h){
-    int x, y;
-    int start_y, start_x, end_y, end_x;
-
-    if(src_y>= h){
-        src+= (h-1-src_y)*linesize;
-        src_y=h-1;
-    }else if(src_y<=-block_h){
-        src+= (1-block_h-src_y)*linesize;
-        src_y=1-block_h;
-    }
-    if(src_x>= w){
-        src+= (w-1-src_x);
-        src_x=w-1;
-    }else if(src_x<=-block_w){
-        src+= (1-block_w-src_x);
-        src_x=1-block_w;
-    }
-
-    start_y= FFMAX(0, -src_y);
-    start_x= FFMAX(0, -src_x);
-    end_y= FFMIN(block_h, h-src_y);
-    end_x= FFMIN(block_w, w-src_x);
-
-    // copy existing part
-    for(y=start_y; y<end_y; y++){
-        for(x=start_x; x<end_x; x++){
-            buf[x + y*linesize]= src[x + y*linesize];
-        }
-    }
-
-    //top
-    for(y=0; y<start_y; y++){
-        for(x=start_x; x<end_x; x++){
-            buf[x + y*linesize]= buf[x + start_y*linesize];
-        }
-    }
-
-    //bottom
-    for(y=end_y; y<block_h; y++){
-        for(x=start_x; x<end_x; x++){
-            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
-        }
-    }
-
-    for(y=0; y<block_h; y++){
-       //left
-        for(x=0; x<start_x; x++){
-            buf[x + y*linesize]= buf[start_x + y*linesize];
-        }
-
-       //right
-        for(x=end_x; x<block_w; x++){
-            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
-        }
-    }
-}
-
-static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
-{
-    int i;
-
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        block[0] = pixels[0];
-        block[1] = pixels[1];
-        block[2] = pixels[2];
-        block[3] = pixels[3];
-        block[4] = pixels[4];
-        block[5] = pixels[5];
-        block[6] = pixels[6];
-        block[7] = pixels[7];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
-                          const uint8_t *s2, int stride){
-    int i;
-
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        block[0] = s1[0] - s2[0];
-        block[1] = s1[1] - s2[1];
-        block[2] = s1[2] - s2[2];
-        block[3] = s1[3] - s2[3];
-        block[4] = s1[4] - s2[4];
-        block[5] = s1[5] - s2[5];
-        block[6] = s1[6] - s2[6];
-        block[7] = s1[7] - s2[7];
-        s1 += stride;
-        s2 += stride;
-        block += 8;
-    }
-}
-
-
-static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
-                                 int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        pixels[0] = cm[block[0]];
-        pixels[1] = cm[block[1]];
-        pixels[2] = cm[block[2]];
-        pixels[3] = cm[block[3]];
-        pixels[4] = cm[block[4]];
-        pixels[5] = cm[block[5]];
-        pixels[6] = cm[block[6]];
-        pixels[7] = cm[block[7]];
-
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
-                                 int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<4;i++) {
-        pixels[0] = cm[block[0]];
-        pixels[1] = cm[block[1]];
-        pixels[2] = cm[block[2]];
-        pixels[3] = cm[block[3]];
-
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
-                                 int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<2;i++) {
-        pixels[0] = cm[block[0]];
-        pixels[1] = cm[block[1]];
-
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void put_signed_pixels_clamped_c(const DCTELEM *block,
-                                        uint8_t *restrict pixels,
-                                        int line_size)
-{
-    int i, j;
-
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            if (*block < -128)
-                *pixels = 0;
-            else if (*block > 127)
-                *pixels = 255;
-            else
-                *pixels = (uint8_t)(*block + 128);
-            block++;
-            pixels++;
-        }
-        pixels += (line_size - 8);
-    }
-}
-
-static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
-                                    int line_size)
-{
-    int i;
-
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        pixels[0] = block[0];
-        pixels[1] = block[1];
-        pixels[2] = block[2];
-        pixels[3] = block[3];
-        pixels[4] = block[4];
-        pixels[5] = block[5];
-        pixels[6] = block[6];
-        pixels[7] = block[7];
-
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
-                          int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<8;i++) {
-        pixels[0] = cm[pixels[0] + block[0]];
-        pixels[1] = cm[pixels[1] + block[1]];
-        pixels[2] = cm[pixels[2] + block[2]];
-        pixels[3] = cm[pixels[3] + block[3]];
-        pixels[4] = cm[pixels[4] + block[4]];
-        pixels[5] = cm[pixels[5] + block[5]];
-        pixels[6] = cm[pixels[6] + block[6]];
-        pixels[7] = cm[pixels[7] + block[7]];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
-                          int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<4;i++) {
-        pixels[0] = cm[pixels[0] + block[0]];
-        pixels[1] = cm[pixels[1] + block[1]];
-        pixels[2] = cm[pixels[2] + block[2]];
-        pixels[3] = cm[pixels[3] + block[3]];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
-                          int line_size)
-{
-    int i;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    /* read the pixels */
-    for(i=0;i<2;i++) {
-        pixels[0] = cm[pixels[0] + block[0]];
-        pixels[1] = cm[pixels[1] + block[1]];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
-{
-    int i;
-    for(i=0;i<8;i++) {
-        pixels[0] += block[0];
-        pixels[1] += block[1];
-        pixels[2] += block[2];
-        pixels[3] += block[3];
-        pixels[4] += block[4];
-        pixels[5] += block[5];
-        pixels[6] += block[6];
-        pixels[7] += block[7];
-        pixels += line_size;
-        block += 8;
-    }
-}
-
-static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
-{
-    int i;
-    for(i=0;i<4;i++) {
-        pixels[0] += block[0];
-        pixels[1] += block[1];
-        pixels[2] += block[2];
-        pixels[3] += block[3];
-        pixels += line_size;
-        block += 4;
-    }
-}
-
-static int sum_abs_dctelem_c(DCTELEM *block)
-{
-    int sum=0, i;
-    for(i=0; i<64; i++)
-        sum+= FFABS(block[i]);
-    return sum;
-}
-
-static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
-{
-    int i;
-
-    for (i = 0; i < h; i++) {
-        memset(block, value, 16);
-        block += line_size;
-    }
-}
-
-static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
-{
-    int i;
-
-    for (i = 0; i < h; i++) {
-        memset(block, value, 8);
-        block += line_size;
-    }
-}
-
-static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
-{
-    int i, j;
-    uint16_t *dst1 = (uint16_t *) dst;
-    uint16_t *dst2 = (uint16_t *)(dst + linesize);
-
-    for (j = 0; j < 8; j++) {
-        for (i = 0; i < 8; i++) {
-            dst1[i] = dst2[i] = src[i] * 0x0101;
-        }
-        src  += 8;
-        dst1 += linesize;
-        dst2 += linesize;
-    }
-}
-
-#if 0
-
-#define PIXOP2(OPNAME, OP) \
-static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint64_t*)block), AV_RN64(pixels));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-\
-static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int i;\
-    for(i=0; i<h; i++){\
-        const uint64_t a= AV_RN64(pixels  );\
-        const uint64_t b= AV_RN64(pixels+1);\
-        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-\
-static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int i;\
-    for(i=0; i<h; i++){\
-        const uint64_t a= AV_RN64(pixels  );\
-        const uint64_t b= AV_RN64(pixels+1);\
-        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-\
-static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int i;\
-    for(i=0; i<h; i++){\
-        const uint64_t a= AV_RN64(pixels          );\
-        const uint64_t b= AV_RN64(pixels+line_size);\
-        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-\
-static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int i;\
-    for(i=0; i<h; i++){\
-        const uint64_t a= AV_RN64(pixels          );\
-        const uint64_t b= AV_RN64(pixels+line_size);\
-        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-\
-static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i;\
-        const uint64_t a= AV_RN64(pixels  );\
-        const uint64_t b= AV_RN64(pixels+1);\
-        uint64_t l0=  (a&0x0303030303030303ULL)\
-                    + (b&0x0303030303030303ULL)\
-                    + 0x0202020202020202ULL;\
-        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-        uint64_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint64_t a= AV_RN64(pixels  );\
-            uint64_t b= AV_RN64(pixels+1);\
-            l1=  (a&0x0303030303030303ULL)\
-               + (b&0x0303030303030303ULL);\
-            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN64(pixels  );\
-            b= AV_RN64(pixels+1);\
-            l0=  (a&0x0303030303030303ULL)\
-               + (b&0x0303030303030303ULL)\
-               + 0x0202020202020202ULL;\
-            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i;\
-        const uint64_t a= AV_RN64(pixels  );\
-        const uint64_t b= AV_RN64(pixels+1);\
-        uint64_t l0=  (a&0x0303030303030303ULL)\
-                    + (b&0x0303030303030303ULL)\
-                    + 0x0101010101010101ULL;\
-        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-        uint64_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint64_t a= AV_RN64(pixels  );\
-            uint64_t b= AV_RN64(pixels+1);\
-            l1=  (a&0x0303030303030303ULL)\
-               + (b&0x0303030303030303ULL);\
-            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN64(pixels  );\
-            b= AV_RN64(pixels+1);\
-            l0=  (a&0x0303030303030303ULL)\
-               + (b&0x0303030303030303ULL)\
-               + 0x0101010101010101ULL;\
-            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
-              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
-            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
-
-#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
-#else // 64 bit variant
-
-#define PIXOP2(OPNAME, OP) \
-static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
-        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
-        pixels+=line_size;\
-        block +=line_size;\
-    }\
-}\
-static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN32(&src1[i*src_stride1  ]);\
-        b= AV_RN32(&src2[i*src_stride2  ]);\
-        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a,b;\
-        a= AV_RN16(&src1[i*src_stride1  ]);\
-        b= AV_RN16(&src2[i*src_stride2  ]);\
-        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
-    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
-                                                int src_stride1, int src_stride2, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
-    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a, b, c, d, l0, l1, h0, h1;\
-        a= AV_RN32(&src1[i*src_stride1]);\
-        b= AV_RN32(&src2[i*src_stride2]);\
-        c= AV_RN32(&src3[i*src_stride3]);\
-        d= AV_RN32(&src4[i*src_stride4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x02020202UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        c= AV_RN32(&src3[i*src_stride3+4]);\
-        d= AV_RN32(&src4[i*src_stride4+4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x02020202UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-    }\
-}\
-\
-static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    int i;\
-    for(i=0; i<h; i++){\
-        uint32_t a, b, c, d, l0, l1, h0, h1;\
-        a= AV_RN32(&src1[i*src_stride1]);\
-        b= AV_RN32(&src2[i*src_stride2]);\
-        c= AV_RN32(&src3[i*src_stride3]);\
-        d= AV_RN32(&src4[i*src_stride4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x01010101UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-        a= AV_RN32(&src1[i*src_stride1+4]);\
-        b= AV_RN32(&src2[i*src_stride2+4]);\
-        c= AV_RN32(&src3[i*src_stride3+4]);\
-        d= AV_RN32(&src4[i*src_stride4+4]);\
-        l0=  (a&0x03030303UL)\
-           + (b&0x03030303UL)\
-           + 0x01010101UL;\
-        h0= ((a&0xFCFCFCFCUL)>>2)\
-          + ((b&0xFCFCFCFCUL)>>2);\
-        l1=  (c&0x03030303UL)\
-           + (d&0x03030303UL);\
-        h1= ((c&0xFCFCFCFCUL)>>2)\
-          + ((d&0xFCFCFCFCUL)>>2);\
-        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-    }\
-}\
-static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-}\
-static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
-                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
-    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
-}\
-\
-static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i, a0, b0, a1, b1;\
-        a0= pixels[0];\
-        b0= pixels[1] + 2;\
-        a0 += b0;\
-        b0 += pixels[2];\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            a1= pixels[0];\
-            b1= pixels[1];\
-            a1 += b1;\
-            b1 += pixels[2];\
-\
-            block[0]= (a1+a0)>>2; /* FIXME non put */\
-            block[1]= (b1+b0)>>2;\
-\
-            pixels+=line_size;\
-            block +=line_size;\
-\
-            a0= pixels[0];\
-            b0= pixels[1] + 2;\
-            a0 += b0;\
-            b0 += pixels[2];\
-\
-            block[0]= (a1+a0)>>2;\
-            block[1]= (b1+b0)>>2;\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x02020202UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x02020202UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-}\
-\
-static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int j;\
-    for(j=0; j<2; j++){\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x02020202UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x02020202UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-        pixels+=4-line_size*(h+1);\
-        block +=4-line_size*h;\
-    }\
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
-    int j;\
-    for(j=0; j<2; j++){\
-        int i;\
-        const uint32_t a= AV_RN32(pixels  );\
-        const uint32_t b= AV_RN32(pixels+1);\
-        uint32_t l0=  (a&0x03030303UL)\
-                    + (b&0x03030303UL)\
-                    + 0x01010101UL;\
-        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
-                   + ((b&0xFCFCFCFCUL)>>2);\
-        uint32_t l1,h1;\
-\
-        pixels+=line_size;\
-        for(i=0; i<h; i+=2){\
-            uint32_t a= AV_RN32(pixels  );\
-            uint32_t b= AV_RN32(pixels+1);\
-            l1=  (a&0x03030303UL)\
-               + (b&0x03030303UL);\
-            h1= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-            a= AV_RN32(pixels  );\
-            b= AV_RN32(pixels+1);\
-            l0=  (a&0x03030303UL)\
-               + (b&0x03030303UL)\
-               + 0x01010101UL;\
-            h0= ((a&0xFCFCFCFCUL)>>2)\
-              + ((b&0xFCFCFCFCUL)>>2);\
-            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
-            pixels+=line_size;\
-            block +=line_size;\
-        }\
-        pixels+=4-line_size*(h+1);\
-        block +=4-line_size*h;\
-    }\
-}\
-\
-CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
-
-#define op_avg(a, b) a = rnd_avg32(a, b)
-#endif
-#define op_put(a, b) a = b
-
-PIXOP2(avg, op_avg)
-PIXOP2(put, op_put)
-#undef op_avg
-#undef op_put
-
-#define avg2(a,b) ((a+b+1)>>1)
-#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
-
-static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
-    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
-}
-
-static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
-    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
-}
-
-static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
-{
-    const int A=(16-x16)*(16-y16);
-    const int B=(   x16)*(16-y16);
-    const int C=(16-x16)*(   y16);
-    const int D=(   x16)*(   y16);
-    int i;
-
-    for(i=0; i<h; i++)
-    {
-        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
-        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
-        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
-        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
-        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
-        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
-        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
-        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
-        dst+= stride;
-        src+= stride;
-    }
-}
-
-void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
-                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
-{
-    int y, vx, vy;
-    const int s= 1<<shift;
-
-    width--;
-    height--;
-
-    for(y=0; y<h; y++){
-        int x;
-
-        vx= ox;
-        vy= oy;
-        for(x=0; x<8; x++){ //XXX FIXME optimize
-            int src_x, src_y, frac_x, frac_y, index;
-
-            src_x= vx>>16;
-            src_y= vy>>16;
-            frac_x= src_x&(s-1);
-            frac_y= src_y&(s-1);
-            src_x>>=shift;
-            src_y>>=shift;
-
-            if((unsigned)src_x < width){
-                if((unsigned)src_y < height){
-                    index= src_x + src_y*stride;
-                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
-                                           + src[index       +1]*   frac_x )*(s-frac_y)
-                                        + (  src[index+stride  ]*(s-frac_x)
-                                           + src[index+stride+1]*   frac_x )*   frac_y
-                                        + r)>>(shift*2);
-                }else{
-                    index= src_x + av_clip(src_y, 0, height)*stride;
-                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
-                                          + src[index       +1]*   frac_x )*s
-                                        + r)>>(shift*2);
-                }
-            }else{
-                if((unsigned)src_y < height){
-                    index= av_clip(src_x, 0, width) + src_y*stride;
-                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
-                                           + src[index+stride  ]*   frac_y )*s
-                                        + r)>>(shift*2);
-                }else{
-                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
-                    dst[y*stride + x]=    src[index         ];
-                }
-            }
-
-            vx+= dxx;
-            vy+= dyx;
-        }
-        ox += dxy;
-        oy += dyy;
-    }
-}
-
-static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    switch(width){
-    case 2: put_pixels2_c (dst, src, stride, height); break;
-    case 4: put_pixels4_c (dst, src, stride, height); break;
-    case 8: put_pixels8_c (dst, src, stride, height); break;
-    case 16:put_pixels16_c(dst, src, stride, height); break;
-    }
-}
-
-static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    switch(width){
-    case 2: avg_pixels2_c (dst, src, stride, height); break;
-    case 4: avg_pixels4_c (dst, src, stride, height); break;
-    case 8: avg_pixels8_c (dst, src, stride, height); break;
-    case 16:avg_pixels16_c(dst, src, stride, height); break;
-    }
-}
-
-static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-
-static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
-    int i,j;
-    for (i=0; i < height; i++) {
-      for (j=0; j < width; j++) {
-        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
-      }
-      src += stride;
-      dst += stride;
-    }
-}
-#if 0
-#define TPEL_WIDTH(width)\
-static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
-static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
-    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
-#endif
-
-#define H264_CHROMA_MC(OPNAME, OP)\
-static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}\
-\
-static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
-            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            OP(dst[2], (A*src[2] + E*src[step+2]));\
-            OP(dst[3], (A*src[3] + E*src[step+3]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}\
-\
-static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
-            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
-            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
-            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
-            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
-            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
-            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }else{\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            OP(dst[1], (A*src[1] + E*src[step+1]));\
-            OP(dst[2], (A*src[2] + E*src[step+2]));\
-            OP(dst[3], (A*src[3] + E*src[step+3]));\
-            OP(dst[4], (A*src[4] + E*src[step+4]));\
-            OP(dst[5], (A*src[5] + E*src[step+5]));\
-            OP(dst[6], (A*src[6] + E*src[step+6]));\
-            OP(dst[7], (A*src[7] + E*src[step+7]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}
-
-#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
-#define op_put(a, b) a = (((b) + 32)>>6)
-
-H264_CHROMA_MC(put_       , op_put)
-H264_CHROMA_MC(avg_       , op_avg)
-#undef op_avg
-#undef op_put
-
-static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-    const int A=(8-x)*(8-y);
-    const int B=(  x)*(8-y);
-    const int C=(8-x)*(  y);
-    const int D=(  x)*(  y);
-    int i;
-
-    assert(x<8 && y<8 && x>=0 && y>=0);
-
-    for(i=0; i<h; i++)
-    {
-        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
-        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
-        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
-        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
-        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
-        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
-        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
-        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
-        dst+= stride;
-        src+= stride;
-    }
-}
-
-static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
-    const int A=(8-x)*(8-y);
-    const int B=(  x)*(8-y);
-    const int C=(8-x)*(  y);
-    const int D=(  x)*(  y);
-    int i;
-
-    assert(x<8 && y<8 && x>=0 && y>=0);
-
-    for(i=0; i<h; i++)
-    {
-        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
-        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
-        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
-        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
-        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
-        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
-        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
-        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
-        dst+= stride;
-        src+= stride;
-    }
-}
-
-#define QPEL_MC(r, OPNAME, RND, OP) \
-static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
-        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
-        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
-        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
-        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
-        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
-        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int src0= src[0*srcStride];\
-        const int src1= src[1*srcStride];\
-        const int src2= src[2*srcStride];\
-        const int src3= src[3*srcStride];\
-        const int src4= src[4*srcStride];\
-        const int src5= src[5*srcStride];\
-        const int src6= src[6*srcStride];\
-        const int src7= src[7*srcStride];\
-        const int src8= src[8*srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
-        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
-        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
-        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
-        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
-        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
-        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    \
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
-        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
-        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
-        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
-        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
-        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
-        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
-        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
-        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
-        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
-        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
-        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
-        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
-        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
-        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
-        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    const int w=16;\
-    for(i=0; i<w; i++)\
-    {\
-        const int src0= src[0*srcStride];\
-        const int src1= src[1*srcStride];\
-        const int src2= src[2*srcStride];\
-        const int src3= src[3*srcStride];\
-        const int src4= src[4*srcStride];\
-        const int src5= src[5*srcStride];\
-        const int src6= src[6*srcStride];\
-        const int src7= src[7*srcStride];\
-        const int src8= src[8*srcStride];\
-        const int src9= src[9*srcStride];\
-        const int src10= src[10*srcStride];\
-        const int src11= src[11*srcStride];\
-        const int src12= src[12*srcStride];\
-        const int src13= src[13*srcStride];\
-        const int src14= src[14*srcStride];\
-        const int src15= src[15*srcStride];\
-        const int src16= src[16*srcStride];\
-        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
-        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
-        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
-        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
-        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
-        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
-        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
-        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
-        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
-        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
-        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
-        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
-        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
-        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
-        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
-        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels8_c(dst, src, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[64];\
-    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[64];\
-    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t half[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
-    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    copy_block9(full, src, 16, stride, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
-}\
-\
-static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t half[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
-    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[72];\
-    uint8_t halfHV[64];\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
-}\
-void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    uint8_t halfV[64];\
-    uint8_t halfHV[64];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
-    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[16*9];\
-    uint8_t halfH[72];\
-    copy_block9(full, src, 16, stride, 9);\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
-    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[72];\
-    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
-    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels16_c(dst, src, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[256];\
-    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[256];\
-    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t half[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
-    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    copy_block17(full, src, 24, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
-}\
-\
-static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t half[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
-    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[272];\
-    uint8_t halfHV[256];\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
-}\
-void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    uint8_t halfV[256];\
-    uint8_t halfHV[256];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[24*17];\
-    uint8_t halfH[272];\
-    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t halfH[272];\
-    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
-}
-
-#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
-#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
-#define op_put(a, b) a = cm[((b) + 16)>>5]
-#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
-
-QPEL_MC(0, put_       , _       , op_put)
-QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
-QPEL_MC(0, avg_       , _       , op_avg)
-//QPEL_MC(1, avg_no_rnd , _       , op_avg)
-#undef op_avg
-#undef op_avg_no_rnd
-#undef op_put
-#undef op_put_no_rnd
-
-#if 1
-#define H264_LOWPASS(OPNAME, OP, OP2) \
-static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=2;\
-    const int w=2;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
-        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
-        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        const int src5= src[5 *srcStride];\
-        const int src6= src[6 *srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=4;\
-    const int w=4;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
-        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
-        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        const int tmp5= tmp[5 *tmpStride];\
-        const int tmp6= tmp[6 *tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
-        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int h=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<h; i++)\
-    {\
-        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
-        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
-        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
-        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
-        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
-        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
-        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
-        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
-        dst+=dstStride;\
-        src+=srcStride;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    const int w=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    for(i=0; i<w; i++)\
-    {\
-        const int srcB= src[-2*srcStride];\
-        const int srcA= src[-1*srcStride];\
-        const int src0= src[0 *srcStride];\
-        const int src1= src[1 *srcStride];\
-        const int src2= src[2 *srcStride];\
-        const int src3= src[3 *srcStride];\
-        const int src4= src[4 *srcStride];\
-        const int src5= src[5 *srcStride];\
-        const int src6= src[6 *srcStride];\
-        const int src7= src[7 *srcStride];\
-        const int src8= src[8 *srcStride];\
-        const int src9= src[9 *srcStride];\
-        const int src10=src[10*srcStride];\
-        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
-        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
-        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
-        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
-        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
-        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
-        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
-        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
-        dst++;\
-        src++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    const int h=8;\
-    const int w=8;\
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
-    int i;\
-    src -= 2*srcStride;\
-    for(i=0; i<h+5; i++)\
-    {\
-        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
-        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
-        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
-        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
-        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
-        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
-        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
-        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
-        tmp+=tmpStride;\
-        src+=srcStride;\
-    }\
-    tmp -= tmpStride*(h+5-2);\
-    for(i=0; i<w; i++)\
-    {\
-        const int tmpB= tmp[-2*tmpStride];\
-        const int tmpA= tmp[-1*tmpStride];\
-        const int tmp0= tmp[0 *tmpStride];\
-        const int tmp1= tmp[1 *tmpStride];\
-        const int tmp2= tmp[2 *tmpStride];\
-        const int tmp3= tmp[3 *tmpStride];\
-        const int tmp4= tmp[4 *tmpStride];\
-        const int tmp5= tmp[5 *tmpStride];\
-        const int tmp6= tmp[6 *tmpStride];\
-        const int tmp7= tmp[7 *tmpStride];\
-        const int tmp8= tmp[8 *tmpStride];\
-        const int tmp9= tmp[9 *tmpStride];\
-        const int tmp10=tmp[10*tmpStride];\
-        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
-        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
-        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
-        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
-        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
-        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
-        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
-        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
-        dst++;\
-        tmp++;\
-    }\
-}\
-\
-static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
-}\
-
-#define H264_MC(OPNAME, SIZE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t half[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t half[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t half[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfH[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfV[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
-    uint8_t full[SIZE*(SIZE+5)];\
-    uint8_t * const full_mid= full + SIZE*2;\
-    int16_t tmp[SIZE*(SIZE+5)];\
-    uint8_t halfV[SIZE*SIZE];\
-    uint8_t halfHV[SIZE*SIZE];\
-    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
-    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
-    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
-}\
-
-#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
-//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
-#define op_put(a, b)  a = cm[((b) + 16)>>5]
-#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
-#define op2_put(a, b)  a = cm[((b) + 512)>>10]
-
-H264_LOWPASS(put_       , op_put, op2_put)
-H264_LOWPASS(avg_       , op_avg, op2_avg)
-H264_MC(put_, 2)
-H264_MC(put_, 4)
-H264_MC(put_, 8)
-H264_MC(put_, 16)
-H264_MC(avg_, 4)
-H264_MC(avg_, 8)
-H264_MC(avg_, 16)
-
-#undef op_avg
-#undef op_put
-#undef op2_avg
-#undef op2_put
-#endif
+#include "h264dsp.h"
 
 #define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
@@ -2667,246 +99,6 @@ H264_WEIGHT(2,2)
 #undef op_scale2
 #undef H264_WEIGHT
 
-static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int i;
-
-    for(i=0; i<h; i++){
-        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
-        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
-        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
-        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
-        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
-        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
-        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
-        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-#if CONFIG_CAVS_DECODER
-/* AVS specific */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
-    put_pixels8_c(dst, src, stride, 8);
-}
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
-    avg_pixels8_c(dst, src, stride, 8);
-}
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
-    put_pixels16_c(dst, src, stride, 16);
-}
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
-    avg_pixels16_c(dst, src, stride, 16);
-}
-#endif /* CONFIG_CAVS_DECODER */
-
-#if CONFIG_VC1_DECODER
-/* VC-1 specific */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
-    put_pixels8_c(dst, src, stride, 8);
-}
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
-    avg_pixels8_c(dst, src, stride, 8);
-}
-#endif /* CONFIG_VC1_DECODER */
-
-/* H264 specific */
-void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
-
-#if CONFIG_RV40_DECODER
-static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
-    put_pixels16_xy2_c(dst, src, stride, 16);
-}
-static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
-    avg_pixels16_xy2_c(dst, src, stride, 16);
-}
-static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
-    put_pixels8_xy2_c(dst, src, stride, 8);
-}
-static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
-    avg_pixels8_xy2_c(dst, src, stride, 8);
-}
-#endif /* CONFIG_RV40_DECODER */
-
-static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int i;
-
-    for(i=0; i<w; i++){
-        const int src_1= src[ -srcStride];
-        const int src0 = src[0          ];
-        const int src1 = src[  srcStride];
-        const int src2 = src[2*srcStride];
-        const int src3 = src[3*srcStride];
-        const int src4 = src[4*srcStride];
-        const int src5 = src[5*srcStride];
-        const int src6 = src[6*srcStride];
-        const int src7 = src[7*srcStride];
-        const int src8 = src[8*srcStride];
-        const int src9 = src[9*srcStride];
-        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
-        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
-        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
-        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
-        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
-        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
-        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
-        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
-        src++;
-        dst++;
-    }
-}
-
-static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
-    put_pixels8_c(dst, src, stride, 8);
-}
-
-static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
-    uint8_t half[64];
-    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
-    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
-}
-
-static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
-    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
-}
-
-static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
-    uint8_t half[64];
-    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
-    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
-}
-
-static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
-    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
-}
-
-static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
-    uint8_t halfH[88];
-    uint8_t halfV[64];
-    uint8_t halfHV[64];
-    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
-    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
-    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
-    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
-}
-static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
-    uint8_t halfH[88];
-    uint8_t halfV[64];
-    uint8_t halfHV[64];
-    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
-    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
-    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
-    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
-}
-static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
-    uint8_t halfH[88];
-    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
-    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
-}
-
-static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
-    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
-    int x;
-    const int strength= ff_h263_loop_filter_strength[qscale];
-
-    for(x=0; x<8; x++){
-        int d1, d2, ad1;
-        int p0= src[x-2*stride];
-        int p1= src[x-1*stride];
-        int p2= src[x+0*stride];
-        int p3= src[x+1*stride];
-        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
-
-        if     (d<-2*strength) d1= 0;
-        else if(d<-  strength) d1=-2*strength - d;
-        else if(d<   strength) d1= d;
-        else if(d< 2*strength) d1= 2*strength - d;
-        else                   d1= 0;
-
-        p1 += d1;
-        p2 -= d1;
-        if(p1&256) p1= ~(p1>>31);
-        if(p2&256) p2= ~(p2>>31);
-
-        src[x-1*stride] = p1;
-        src[x+0*stride] = p2;
-
-        ad1= FFABS(d1)>>1;
-
-        d2= av_clip((p0-p3)/4, -ad1, ad1);
-
-        src[x-2*stride] = p0 - d2;
-        src[x+  stride] = p3 + d2;
-    }
-    }
-}
-
-static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
-    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
-    int y;
-    const int strength= ff_h263_loop_filter_strength[qscale];
-
-    for(y=0; y<8; y++){
-        int d1, d2, ad1;
-        int p0= src[y*stride-2];
-        int p1= src[y*stride-1];
-        int p2= src[y*stride+0];
-        int p3= src[y*stride+1];
-        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
-
-        if     (d<-2*strength) d1= 0;
-        else if(d<-  strength) d1=-2*strength - d;
-        else if(d<   strength) d1= d;
-        else if(d< 2*strength) d1= 2*strength - d;
-        else                   d1= 0;
-
-        p1 += d1;
-        p2 -= d1;
-        if(p1&256) p1= ~(p1>>31);
-        if(p2&256) p2= ~(p2>>31);
-
-        src[y*stride-1] = p1;
-        src[y*stride+0] = p2;
-
-        ad1= FFABS(d1)>>1;
-
-        d2= av_clip((p0-p3)/4, -ad1, ad1);
-
-        src[y*stride-2] = p0 - d2;
-        src[y*stride+1] = p3 + d2;
-    }
-    }
-}
-
-static void h261_loop_filter_c(uint8_t *src, int stride){
-    int x,y,xy,yz;
-    int temp[64];
-
-    for(x=0; x<8; x++){
-        temp[x      ] = 4*src[x           ];
-        temp[x + 7*8] = 4*src[x + 7*stride];
-    }
-    for(y=1; y<7; y++){
-        for(x=0; x<8; x++){
-            xy = y * stride + x;
-            yz = y * 8 + x;
-            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
-        }
-    }
-
-    for(y=0; y<8; y++){
-        src[  y*stride] = (temp[  y*8] + 2)>>2;
-        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
-        for(x=1; x<7; x++){
-            xy = y * stride + x;
-            yz = y * 8 + x;
-            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
-        }
-    }
-}
-
 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
 {
     int i, d;
@@ -3080,1560 +272,16 @@ static void h264_h_loop_filter_chroma_in
     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
 }
 
-static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - pix2[0]);
-        s += abs(pix1[1] - pix2[1]);
-        s += abs(pix1[2] - pix2[2]);
-        s += abs(pix1[3] - pix2[3]);
-        s += abs(pix1[4] - pix2[4]);
-        s += abs(pix1[5] - pix2[5]);
-        s += abs(pix1[6] - pix2[6]);
-        s += abs(pix1[7] - pix2[7]);
-        s += abs(pix1[8] - pix2[8]);
-        s += abs(pix1[9] - pix2[9]);
-        s += abs(pix1[10] - pix2[10]);
-        s += abs(pix1[11] - pix2[11]);
-        s += abs(pix1[12] - pix2[12]);
-        s += abs(pix1[13] - pix2[13]);
-        s += abs(pix1[14] - pix2[14]);
-        s += abs(pix1[15] - pix2[15]);
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
-        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
-        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
-        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
-        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
-        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
-        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
-        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
-        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
-        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
-        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
-        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
-        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
-        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
-        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
-        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
-        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
-        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
-        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
-        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
-        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
-        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
-        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
-        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
-        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
-        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
-        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
-        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
-        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
-        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
-        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
-        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
-        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
-        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
-        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
-        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
-        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
-        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
-        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
-        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
-        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
-        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
-        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
-        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
-        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
-        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - pix2[0]);
-        s += abs(pix1[1] - pix2[1]);
-        s += abs(pix1[2] - pix2[2]);
-        s += abs(pix1[3] - pix2[3]);
-        s += abs(pix1[4] - pix2[4]);
-        s += abs(pix1[5] - pix2[5]);
-        s += abs(pix1[6] - pix2[6]);
-        s += abs(pix1[7] - pix2[7]);
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
-        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
-        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
-        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
-        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
-        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
-        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
-        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
-        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
-        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
-        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
-        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
-        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
-        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
-        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
-{
-    int s, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    s = 0;
-    for(i=0;i<h;i++) {
-        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
-        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
-        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
-        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
-        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
-        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
-        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
-        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
-    MpegEncContext *c = v;
-    int score1=0;
-    int score2=0;
-    int x,y;
-
-    for(y=0; y<h; y++){
-        for(x=0; x<16; x++){
-            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
-        }
-        if(y+1<h){
-            for(x=0; x<15; x++){
-                score2+= FFABS(  s1[x  ] - s1[x  +stride]
-                             - s1[x+1] + s1[x+1+stride])
-                        -FFABS(  s2[x  ] - s2[x  +stride]
-                             - s2[x+1] + s2[x+1+stride]);
-            }
-        }
-        s1+= stride;
-        s2+= stride;
-    }
-
-    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + FFABS(score2)*8;
-}
-
-static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
-    MpegEncContext *c = v;
-    int score1=0;
-    int score2=0;
-    int x,y;
-
-    for(y=0; y<h; y++){
-        for(x=0; x<8; x++){
-            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
-        }
-        if(y+1<h){
-            for(x=0; x<7; x++){
-                score2+= FFABS(  s1[x  ] - s1[x  +stride]
-                             - s1[x+1] + s1[x+1+stride])
-                        -FFABS(  s2[x  ] - s2[x  +stride]
-                             - s2[x+1] + s2[x+1+stride]);
-            }
-        }
-        s1+= stride;
-        s2+= stride;
-    }
-
-    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + FFABS(score2)*8;
-}
-
-static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
-    int i;
-    unsigned int sum=0;
-
-    for(i=0; i<8*8; i++){
-        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
-        int w= weight[i];
-        b>>= RECON_SHIFT;
-        assert(-512<b && b<512);
-
-        sum += (w*b)*(w*b)>>4;
-    }
-    return sum>>2;
-}
-
-static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
-    int i;
-
-    for(i=0; i<8*8; i++){
-        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-    }
-}
-
-/**
- * permutes an 8x8 block.
- * @param block the block which will be permuted according to the given permutation vector
- * @param permutation the permutation vector
- * @param last the last non zero coefficient in scantable order, used to speed the permutation up
- * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
- *                  (inverse) permutated to scantable order!
- */
-void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
-{
-    int i;
-    DCTELEM temp[64];
-
-    if(last<=0) return;
-    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
-
-    for(i=0; i<=last; i++){
-        const int j= scantable[i];
-        temp[j]= block[j];
-        block[j]=0;
-    }
-
-    for(i=0; i<=last; i++){
-        const int j= scantable[i];
-        const int perm_j= permutation[j];
-        block[perm_j]= temp[j];
-    }
-}
-
-static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
-    return 0;
-}
-
-void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
-    int i;
-
-    memset(cmp, 0, sizeof(void*)*6);
-
-    for(i=0; i<6; i++){
-        switch(type&0xFF){
-        case FF_CMP_SAD:
-            cmp[i]= c->sad[i];
-            break;
-        case FF_CMP_SATD:
-            cmp[i]= c->hadamard8_diff[i];
-            break;
-        case FF_CMP_SSE:
-            cmp[i]= c->sse[i];
-            break;
-        case FF_CMP_DCT:
-            cmp[i]= c->dct_sad[i];
-            break;
-        case FF_CMP_DCT264:
-            cmp[i]= c->dct264_sad[i];
-            break;
-        case FF_CMP_DCTMAX:
-            cmp[i]= c->dct_max[i];
-            break;
-        case FF_CMP_PSNR:
-            cmp[i]= c->quant_psnr[i];
-            break;
-        case FF_CMP_BIT:
-            cmp[i]= c->bit[i];
-            break;
-        case FF_CMP_RD:
-            cmp[i]= c->rd[i];
-            break;
-        case FF_CMP_VSAD:
-            cmp[i]= c->vsad[i];
-            break;
-        case FF_CMP_VSSE:
-            cmp[i]= c->vsse[i];
-            break;
-        case FF_CMP_ZERO:
-            cmp[i]= zero_cmp;
-            break;
-        case FF_CMP_NSSE:
-            cmp[i]= c->nsse[i];
-            break;
-#if CONFIG_DWT
-        case FF_CMP_W53:
-            cmp[i]= c->w53[i];
-            break;
-        case FF_CMP_W97:
-            cmp[i]= c->w97[i];
-            break;
-#endif
-        default:
-            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
-        }
-    }
-}
-
-static void clear_block_c(DCTELEM *block)
-{
-    memset(block, 0, sizeof(DCTELEM)*64);
-}
-
-/**
- * memset(blocks, 0, sizeof(DCTELEM)*6*64)
- */
-static void clear_blocks_c(DCTELEM *blocks)
-{
-    memset(blocks, 0, sizeof(DCTELEM)*6*64);
-}
-
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
-    long i;
-    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
-        long a = *(long*)(src+i);
-        long b = *(long*)(dst+i);
-        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
-    }
-    for(; i<w; i++)
-        dst[i+0] += src[i+0];
-}
-
-static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
-    long i;
-    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
-        long a = *(long*)(src1+i);
-        long b = *(long*)(src2+i);
-        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
-    }
-    for(; i<w; i++)
-        dst[i] = src1[i]+src2[i];
-}
-
-static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
-    long i;
-#if !HAVE_FAST_UNALIGNED
-    if((long)src2 & (sizeof(long)-1)){
-        for(i=0; i+7<w; i+=8){
-            dst[i+0] = src1[i+0]-src2[i+0];
-            dst[i+1] = src1[i+1]-src2[i+1];
-            dst[i+2] = src1[i+2]-src2[i+2];
-            dst[i+3] = src1[i+3]-src2[i+3];
-            dst[i+4] = src1[i+4]-src2[i+4];
-            dst[i+5] = src1[i+5]-src2[i+5];
-            dst[i+6] = src1[i+6]-src2[i+6];
-            dst[i+7] = src1[i+7]-src2[i+7];
-        }
-    }else
-#endif
-    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
-        long a = *(long*)(src1+i);
-        long b = *(long*)(src2+i);
-        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
-    }
-    for(; i<w; i++)
-        dst[i+0] = src1[i+0]-src2[i+0];
-}
-
-static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
-    int i;
-    uint8_t l, lt;
-
-    l= *left;
-    lt= *left_top;
-
-    for(i=0; i<w; i++){
-        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
-        lt= src1[i];
-        dst[i]= l;
-    }
-
-    *left= l;
-    *left_top= lt;
-}
-
-static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
-    int i;
-    uint8_t l, lt;
-
-    l= *left;
-    lt= *left_top;
-
-    for(i=0; i<w; i++){
-        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
-        lt= src1[i];
-        l= src2[i];
-        dst[i]= l - pred;
-    }
-
-    *left= l;
-    *left_top= lt;
-}
-
-static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
-    int i;
-
-    for(i=0; i<w-1; i++){
-        acc+= src[i];
-        dst[i]= acc;
-        i++;
-        acc+= src[i];
-        dst[i]= acc;
-    }
-
-    for(; i<w; i++){
-        acc+= src[i];
-        dst[i]= acc;
-    }
-
-    return acc;
-}
-
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
-static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
-    int i;
-    int r,g,b,a;
-    r= *red;
-    g= *green;
-    b= *blue;
-    a= *alpha;
-
-    for(i=0; i<w; i++){
-        b+= src[4*i+B];
-        g+= src[4*i+G];
-        r+= src[4*i+R];
-        a+= src[4*i+A];
-
-        dst[4*i+B]= b;
-        dst[4*i+G]= g;
-        dst[4*i+R]= r;
-        dst[4*i+A]= a;
-    }
-
-    *red= r;
-    *green= g;
-    *blue= b;
-    *alpha= a;
-}
-#undef B
-#undef G
-#undef R
-#undef A
-
-#define BUTTERFLY2(o1,o2,i1,i2) \
-o1= (i1)+(i2);\
-o2= (i1)-(i2);
-
-#define BUTTERFLY1(x,y) \
-{\
-    int a,b;\
-    a= x;\
-    b= y;\
-    x= a+b;\
-    y= a-b;\
-}
-
-#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
-
-static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
-    int i;
-    int temp[64];
-    int sum=0;
-
-    assert(h==8);
-
-    for(i=0; i<8; i++){
-        //FIXME try pointer walks
-        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
-        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
-        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
-        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
-
-        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
-        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
-        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
-        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
-
-        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
-        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
-        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
-        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
-    }
-
-    for(i=0; i<8; i++){
-        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
-        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
-        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
-        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
-
-        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
-        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
-        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
-        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
-
-        sum +=
-             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
-            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
-            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
-            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
-    }
-#if 0
-static int maxi=0;
-if(sum>maxi){
-    maxi=sum;
-    printf("MAX:%d\n", maxi);
-}
-#endif
-    return sum;
-}
-
-static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
-    int i;
-    int temp[64];
-    int sum=0;
-
-    assert(h==8);
-
-    for(i=0; i<8; i++){
-        //FIXME try pointer walks
-        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
-        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
-        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
-        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
-
-        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
-        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
-        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
-        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
-
-        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
-        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
-        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
-        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
-    }
-
-    for(i=0; i<8; i++){
-        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
-        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
-        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
-        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
-
-        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
-        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
-        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
-        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
-
-        sum +=
-             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
-            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
-            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
-            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
-    }
-
-    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
-
-    return sum;
-}
-
-static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
-
-    assert(h==8);
-
-    s->dsp.diff_pixels(temp, src1, src2, stride);
-    s->dsp.fdct(temp);
-    return s->dsp.sum_abs_dctelem(temp);
-}
-
-#if CONFIG_GPL
-#define DCT8_1D {\
-    const int s07 = SRC(0) + SRC(7);\
-    const int s16 = SRC(1) + SRC(6);\
-    const int s25 = SRC(2) + SRC(5);\
-    const int s34 = SRC(3) + SRC(4);\
-    const int a0 = s07 + s34;\
-    const int a1 = s16 + s25;\
-    const int a2 = s07 - s34;\
-    const int a3 = s16 - s25;\
-    const int d07 = SRC(0) - SRC(7);\
-    const int d16 = SRC(1) - SRC(6);\
-    const int d25 = SRC(2) - SRC(5);\
-    const int d34 = SRC(3) - SRC(4);\
-    const int a4 = d16 + d25 + (d07 + (d07>>1));\
-    const int a5 = d07 - d34 - (d25 + (d25>>1));\
-    const int a6 = d07 + d34 - (d16 + (d16>>1));\
-    const int a7 = d16 - d25 + (d34 + (d34>>1));\
-    DST(0,  a0 + a1     ) ;\
-    DST(1,  a4 + (a7>>2)) ;\
-    DST(2,  a2 + (a3>>1)) ;\
-    DST(3,  a5 + (a6>>2)) ;\
-    DST(4,  a0 - a1     ) ;\
-    DST(5,  a6 - (a5>>2)) ;\
-    DST(6, (a2>>1) - a3 ) ;\
-    DST(7, (a4>>2) - a7 ) ;\
-}
-
-static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    DCTELEM dct[8][8];
-    int i;
-    int sum=0;
-
-    s->dsp.diff_pixels(dct[0], src1, src2, stride);
-
-#define SRC(x) dct[i][x]
-#define DST(x,v) dct[i][x]= v
-    for( i = 0; i < 8; i++ )
-        DCT8_1D
-#undef SRC
-#undef DST
-
-#define SRC(x) dct[x][i]
-#define DST(x,v) sum += FFABS(v)
-    for( i = 0; i < 8; i++ )
-        DCT8_1D
-#undef SRC
-#undef DST
-    return sum;
-}
-#endif
-
-static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
-    int sum=0, i;
-
-    assert(h==8);
-
-    s->dsp.diff_pixels(temp, src1, src2, stride);
-    s->dsp.fdct(temp);
-
-    for(i=0; i<64; i++)
-        sum= FFMAX(sum, FFABS(temp[i]));
-
-    return sum;
-}
-
-static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
-    DCTELEM * const bak = temp+64;
-    int sum=0, i;
-
-    assert(h==8);
-    s->mb_intra=0;
-
-    s->dsp.diff_pixels(temp, src1, src2, stride);
-
-    memcpy(bak, temp, 64*sizeof(DCTELEM));
-
-    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
-    s->dct_unquantize_inter(s, temp, 0, s->qscale);
-    ff_simple_idct(temp); //FIXME
-
-    for(i=0; i<64; i++)
-        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
-
-    return sum;
-}
-
-static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    const uint8_t *scantable= s->intra_scantable.permutated;
-    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
-    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
-    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
-    int i, last, run, bits, level, distortion, start_i;
-    const int esc_length= s->ac_esc_length;
-    uint8_t * length;
-    uint8_t * last_length;
-
-    assert(h==8);
-
-    copy_block8(lsrc1, src1, 8, stride, 8);
-    copy_block8(lsrc2, src2, 8, stride, 8);
-
-    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
-
-    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
-
-    bits=0;
-
-    if (s->mb_intra) {
-        start_i = 1;
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
-        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
-    } else {
-        start_i = 0;
-        length     = s->inter_ac_vlc_length;
-        last_length= s->inter_ac_vlc_last_length;
-    }
-
-    if(last>=start_i){
-        run=0;
-        for(i=start_i; i<last; i++){
-            int j= scantable[i];
-            level= temp[j];
-
-            if(level){
-                level+=64;
-                if((level&(~127)) == 0){
-                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
-                }else
-                    bits+= esc_length;
-                run=0;
-            }else
-                run++;
-        }
-        i= scantable[last];
-
-        level= temp[i] + 64;
-
-        assert(level - 64);
-
-        if((level&(~127)) == 0){
-            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
-        }else
-            bits+= esc_length;
-
-    }
-
-    if(last>=0){
-        if(s->mb_intra)
-            s->dct_unquantize_intra(s, temp, 0, s->qscale);
-        else
-            s->dct_unquantize_inter(s, temp, 0, s->qscale);
-    }
-
-    s->dsp.idct_add(lsrc2, 8, temp);
-
-    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
-
-    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
-}
-
-static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
-    MpegEncContext * const s= (MpegEncContext *)c;
-    const uint8_t *scantable= s->intra_scantable.permutated;
-    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
-    int i, last, run, bits, level, start_i;
-    const int esc_length= s->ac_esc_length;
-    uint8_t * length;
-    uint8_t * last_length;
-
-    assert(h==8);
-
-    s->dsp.diff_pixels(temp, src1, src2, stride);
-
-    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
-
-    bits=0;
-
-    if (s->mb_intra) {
-        start_i = 1;
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
-        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
-    } else {
-        start_i = 0;
-        length     = s->inter_ac_vlc_length;
-        last_length= s->inter_ac_vlc_last_length;
-    }
-
-    if(last>=start_i){
-        run=0;
-        for(i=start_i; i<last; i++){
-            int j= scantable[i];
-            level= temp[j];
-
-            if(level){
-                level+=64;
-                if((level&(~127)) == 0){
-                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
-                }else
-                    bits+= esc_length;
-                run=0;
-            }else
-                run++;
-        }
-        i= scantable[last];
-
-        level= temp[i] + 64;
-
-        assert(level - 64);
-
-        if((level&(~127)) == 0){
-            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
-        }else
-            bits+= esc_length;
-    }
-
-    return bits;
-}
-
-#define VSAD_INTRA(size) \
-static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
-    int score=0;                                                                                            \
-    int x,y;                                                                                                \
-                                                                                                            \
-    for(y=1; y<h; y++){                                                                                     \
-        for(x=0; x<size; x+=4){                                                                             \
-            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
-                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
-        }                                                                                                   \
-        s+= stride;                                                                                         \
-    }                                                                                                       \
-                                                                                                            \
-    return score;                                                                                           \
-}
-VSAD_INTRA(8)
-VSAD_INTRA(16)
-
-static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
-    int score=0;
-    int x,y;
-
-    for(y=1; y<h; y++){
-        for(x=0; x<16; x++){
-            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
-        }
-        s1+= stride;
-        s2+= stride;
-    }
-
-    return score;
-}
-
-#define SQ(a) ((a)*(a))
-#define VSSE_INTRA(size) \
-static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
-    int score=0;                                                                                            \
-    int x,y;                                                                                                \
-                                                                                                            \
-    for(y=1; y<h; y++){                                                                                     \
-        for(x=0; x<size; x+=4){                                                                               \
-            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
-                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
-        }                                                                                                   \
-        s+= stride;                                                                                         \
-    }                                                                                                       \
-                                                                                                            \
-    return score;                                                                                           \
-}
-VSSE_INTRA(8)
-VSSE_INTRA(16)
-
-static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
-    int score=0;
-    int x,y;
-
-    for(y=1; y<h; y++){
-        for(x=0; x<16; x++){
-            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
-        }
-        s1+= stride;
-        s2+= stride;
-    }
-
-    return score;
-}
-
-static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
-                               int size){
-    int score=0;
-    int i;
-    for(i=0; i<size; i++)
-        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
-    return score;
-}
-
-WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
-WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
-WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
-#if CONFIG_GPL
-WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
-#endif
-WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
-WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
-WRAPPER8_16_SQ(rd8x8_c, rd16_c)
-WRAPPER8_16_SQ(bit8x8_c, bit16_c)
-
-static void vector_fmul_c(float *dst, const float *src, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] *= src[i];
-}
-
-static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
-    int i;
-    src1 += len-1;
-    for(i=0; i<len; i++)
-        dst[i] = src0[i] * src1[-i];
-}
-
-static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = src0[i] * src1[i] + src2[i];
-}
-
-void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
-    int i,j;
-    dst += len;
-    win += len;
-    src0+= len;
-    for(i=-len, j=len-1; i<0; i++, j--) {
-        float s0 = src0[i];
-        float s1 = src1[j];
-        float wi = win[i];
-        float wj = win[j];
-        dst[i] = s0*wj - s1*wi + add_bias;
-        dst[j] = s0*wi + s1*wj + add_bias;
-    }
-}
-
-static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
-                                 int len)
-{
-    int i;
-    for (i = 0; i < len; i++)
-        dst[i] = src[i] * mul;
-}
-
-static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
-                                      const float **sv, float mul, int len)
-{
-    int i;
-    for (i = 0; i < len; i += 2, sv++) {
-        dst[i  ] = src[i  ] * sv[0][0] * mul;
-        dst[i+1] = src[i+1] * sv[0][1] * mul;
-    }
-}
-
-static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
-                                      const float **sv, float mul, int len)
-{
-    int i;
-    for (i = 0; i < len; i += 4, sv++) {
-        dst[i  ] = src[i  ] * sv[0][0] * mul;
-        dst[i+1] = src[i+1] * sv[0][1] * mul;
-        dst[i+2] = src[i+2] * sv[0][2] * mul;
-        dst[i+3] = src[i+3] * sv[0][3] * mul;
-    }
-}
-
-static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
-                               int len)
-{
-    int i;
-    for (i = 0; i < len; i += 2, sv++) {
-        dst[i  ] = sv[0][0] * mul;
-        dst[i+1] = sv[0][1] * mul;
-    }
-}
-
-static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
-                               int len)
-{
-    int i;
-    for (i = 0; i < len; i += 4, sv++) {
-        dst[i  ] = sv[0][0] * mul;
-        dst[i+1] = sv[0][1] * mul;
-        dst[i+2] = sv[0][2] * mul;
-        dst[i+3] = sv[0][3] * mul;
-    }
-}
-
-static void butterflies_float_c(float *restrict v1, float *restrict v2,
-                                int len)
-{
-    int i;
-    for (i = 0; i < len; i++) {
-        float t = v1[i] - v2[i];
-        v1[i] += v2[i];
-        v2[i] = t;
-    }
-}
-
-static float scalarproduct_float_c(const float *v1, const float *v2, int len)
-{
-    float p = 0.0;
-    int i;
-
-    for (i = 0; i < len; i++)
-        p += v1[i] * v2[i];
-
-    return p;
-}
-
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = src[i] * mul;
-}
-
-static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
-                   uint32_t maxi, uint32_t maxisign)
-{
-
-    if(a > mini) return mini;
-    else if((a^(1<<31)) > maxisign) return maxi;
-    else return a;
-}
-
-static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
-    int i;
-    uint32_t mini = *(uint32_t*)min;
-    uint32_t maxi = *(uint32_t*)max;
-    uint32_t maxisign = maxi ^ (1<<31);
-    uint32_t *dsti = (uint32_t*)dst;
-    const uint32_t *srci = (const uint32_t*)src;
-    for(i=0; i<len; i+=8) {
-        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
-        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
-        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
-        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
-        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
-        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
-        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
-        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
-    }
-}
-static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
-    int i;
-    if(min < 0 && max > 0) {
-        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
-    } else {
-        for(i=0; i < len; i+=8) {
-            dst[i    ] = av_clipf(src[i    ], min, max);
-            dst[i + 1] = av_clipf(src[i + 1], min, max);
-            dst[i + 2] = av_clipf(src[i + 2], min, max);
-            dst[i + 3] = av_clipf(src[i + 3], min, max);
-            dst[i + 4] = av_clipf(src[i + 4], min, max);
-            dst[i + 5] = av_clipf(src[i + 5], min, max);
-            dst[i + 6] = av_clipf(src[i + 6], min, max);
-            dst[i + 7] = av_clipf(src[i + 7], min, max);
-        }
-    }
-}
-
-static av_always_inline int float_to_int16_one(const float *src){
-    int_fast32_t tmp = *(const int32_t*)src;
-    if(tmp & 0xf0000){
-        tmp = (0x43c0ffff - tmp)>>31;
-        // is this faster on some gcc/cpu combinations?
-//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
-//      else                 tmp = 0;
-    }
-    return tmp - 0x8000;
-}
-
-void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = float_to_int16_one(src+i);
-}
-
-void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
-    int i,j,c;
-    if(channels==2){
-        for(i=0; i<len; i++){
-            dst[2*i]   = float_to_int16_one(src[0]+i);
-            dst[2*i+1] = float_to_int16_one(src[1]+i);
-        }
-    }else{
-        for(c=0; c<channels; c++)
-            for(i=0, j=c; i<len; i++, j+=channels)
-                dst[j] = float_to_int16_one(src[c]+i);
-    }
-}
-
-static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-
-    while (order--)
-        res += (*v1++ * *v2++) >> shift;
-
-    return res;
-}
-
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-{
-    int res = 0;
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
-#define W0 2048
-#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
-#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
-#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
-#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
-#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
-#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
-#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
-
-static void wmv2_idct_row(short * b)
-{
-    int s1,s2;
-    int a0,a1,a2,a3,a4,a5,a6,a7;
-    /*step 1*/
-    a1 = W1*b[1]+W7*b[7];
-    a7 = W7*b[1]-W1*b[7];
-    a5 = W5*b[5]+W3*b[3];
-    a3 = W3*b[5]-W5*b[3];
-    a2 = W2*b[2]+W6*b[6];
-    a6 = W6*b[2]-W2*b[6];
-    a0 = W0*b[0]+W0*b[4];
-    a4 = W0*b[0]-W0*b[4];
-    /*step 2*/
-    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
-    s2 = (181*(a1-a5-a7+a3)+128)>>8;
-    /*step 3*/
-    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
-    b[1] = (a4+a6 +s1   + (1<<7))>>8;
-    b[2] = (a4-a6 +s2   + (1<<7))>>8;
-    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
-    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
-    b[5] = (a4-a6 -s2   + (1<<7))>>8;
-    b[6] = (a4+a6 -s1   + (1<<7))>>8;
-    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
-}
-static void wmv2_idct_col(short * b)
-{
-    int s1,s2;
-    int a0,a1,a2,a3,a4,a5,a6,a7;
-    /*step 1, with extended precision*/
-    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
-    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
-    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
-    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
-    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
-    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
-    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
-    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
-    /*step 2*/
-    s1 = (181*(a1-a5+a7-a3)+128)>>8;
-    s2 = (181*(a1-a5-a7+a3)+128)>>8;
-    /*step 3*/
-    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
-    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
-    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
-    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
-
-    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
-    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
-    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
-    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
-}
-void ff_wmv2_idct_c(short * block){
-    int i;
-
-    for(i=0;i<64;i+=8){
-        wmv2_idct_row(block+i);
-    }
-    for(i=0;i<8;i++){
-        wmv2_idct_col(block+i);
-    }
-}
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
- converted */
-static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_wmv2_idct_c(block);
-    put_pixels_clamped_c(block, dest, line_size);
-}
-static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    ff_wmv2_idct_c(block);
-    add_pixels_clamped_c(block, dest, line_size);
-}
-static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct (block);
-    put_pixels_clamped_c(block, dest, line_size);
-}
-static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct (block);
-    add_pixels_clamped_c(block, dest, line_size);
-}
-
-static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct4 (block);
-    put_pixels_clamped4_c(block, dest, line_size);
-}
-static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct4 (block);
-    add_pixels_clamped4_c(block, dest, line_size);
-}
-
-static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct2 (block);
-    put_pixels_clamped2_c(block, dest, line_size);
-}
-static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    j_rev_dct2 (block);
-    add_pixels_clamped2_c(block, dest, line_size);
-}
-
-static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    dest[0] = cm[(block[0] + 4)>>3];
-}
-static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-
-    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
-}
-
-static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
-
-/* init static data */
-av_cold void dsputil_static_init(void)
-{
-    int i;
-
-    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
-    for(i=0;i<MAX_NEG_CROP;i++) {
-        ff_cropTbl[i] = 0;
-        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
-    }
-
-    for(i=0;i<512;i++) {
-        ff_squareTbl[i] = (i - 256) * (i - 256);
-    }
-
-    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
-}
-
-int ff_check_alignment(void){
-    static int did_fail=0;
-    DECLARE_ALIGNED(16, int, aligned);
-
-    if((intptr_t)&aligned & 15){
-        if(!did_fail){
-#if HAVE_MMX || HAVE_ALTIVEC
-            av_log(NULL, AV_LOG_ERROR,
-                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
-                "and may be very slow or crash. This is not a bug in libavcodec,\n"
-                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
-                "Do not report crashes to FFmpeg developers.\n");
-#endif
-            did_fail=1;
-        }
-        return -1;
-    }
-    return 0;
-}
-
-av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
+void ff_h264dsp_init(H264DSPContext *c)
 {
-    int i;
-
-    ff_check_alignment();
-
-#if CONFIG_ENCODERS
-    if(avctx->dct_algo==FF_DCT_FASTINT) {
-        c->fdct = fdct_ifast;
-        c->fdct248 = fdct_ifast248;
-    }
-    else if(avctx->dct_algo==FF_DCT_FAAN) {
-        c->fdct = ff_faandct;
-        c->fdct248 = ff_faandct248;
-    }
-    else {
-        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
-        c->fdct248 = ff_fdct248_islow;
-    }
-#endif //CONFIG_ENCODERS
-
-    if(avctx->lowres==1){
-        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
-            c->idct_put= ff_jref_idct4_put;
-            c->idct_add= ff_jref_idct4_add;
-        }else{
-            c->idct_put= ff_h264_lowres_idct_put_c;
-            c->idct_add= ff_h264_lowres_idct_add_c;
-        }
-        c->idct    = j_rev_dct4;
-        c->idct_permutation_type= FF_NO_IDCT_PERM;
-    }else if(avctx->lowres==2){
-        c->idct_put= ff_jref_idct2_put;
-        c->idct_add= ff_jref_idct2_add;
-        c->idct    = j_rev_dct2;
-        c->idct_permutation_type= FF_NO_IDCT_PERM;
-    }else if(avctx->lowres==3){
-        c->idct_put= ff_jref_idct1_put;
-        c->idct_add= ff_jref_idct1_add;
-        c->idct    = j_rev_dct1;
-        c->idct_permutation_type= FF_NO_IDCT_PERM;
-    }else{
-        if(avctx->idct_algo==FF_IDCT_INT){
-            c->idct_put= ff_jref_idct_put;
-            c->idct_add= ff_jref_idct_add;
-            c->idct    = j_rev_dct;
-            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
-                avctx->idct_algo==FF_IDCT_VP3){
-            c->idct_put= ff_vp3_idct_put_c;
-            c->idct_add= ff_vp3_idct_add_c;
-            c->idct    = ff_vp3_idct_c;
-            c->idct_permutation_type= FF_NO_IDCT_PERM;
-        }else if(avctx->idct_algo==FF_IDCT_WMV2){
-            c->idct_put= ff_wmv2_idct_put_c;
-            c->idct_add= ff_wmv2_idct_add_c;
-            c->idct    = ff_wmv2_idct_c;
-            c->idct_permutation_type= FF_NO_IDCT_PERM;
-        }else if(avctx->idct_algo==FF_IDCT_FAAN){
-            c->idct_put= ff_faanidct_put;
-            c->idct_add= ff_faanidct_add;
-            c->idct    = ff_faanidct;
-            c->idct_permutation_type= FF_NO_IDCT_PERM;
-        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
-            c->idct_put= ff_ea_idct_put_c;
-            c->idct_permutation_type= FF_NO_IDCT_PERM;
-        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
-            c->idct     = ff_bink_idct_c;
-            c->idct_add = ff_bink_idct_add_c;
-            c->idct_put = ff_bink_idct_put_c;
-            c->idct_permutation_type = FF_NO_IDCT_PERM;
-        }else{ //accurate/default
-            c->idct_put= ff_simple_idct_put;
-            c->idct_add= ff_simple_idct_add;
-            c->idct    = ff_simple_idct;
-            c->idct_permutation_type= FF_NO_IDCT_PERM;
-        }
-    }
-
-    if (CONFIG_H264_DECODER) {
-        c->h264_idct_add= ff_h264_idct_add_c;
-        c->h264_idct8_add= ff_h264_idct8_add_c;
-        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
-        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
-        c->h264_idct_add16     = ff_h264_idct_add16_c;
-        c->h264_idct8_add4     = ff_h264_idct8_add4_c;
-        c->h264_idct_add8      = ff_h264_idct_add8_c;
-        c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
-    }
-
-    c->get_pixels = get_pixels_c;
-    c->diff_pixels = diff_pixels_c;
-    c->put_pixels_clamped = put_pixels_clamped_c;
-    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
-    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
-    c->add_pixels_clamped = add_pixels_clamped_c;
-    c->add_pixels8 = add_pixels8_c;
-    c->add_pixels4 = add_pixels4_c;
-    c->sum_abs_dctelem = sum_abs_dctelem_c;
-    c->gmc1 = gmc1_c;
-    c->gmc = ff_gmc_c;
-    c->clear_block = clear_block_c;
-    c->clear_blocks = clear_blocks_c;
-    c->pix_sum = pix_sum_c;
-    c->pix_norm1 = pix_norm1_c;
-
-    c->fill_block_tab[0] = fill_block16_c;
-    c->fill_block_tab[1] = fill_block8_c;
-    c->scale_block = scale_block_c;
-
-    /* TODO [0] 16  [1] 8 */
-    c->pix_abs[0][0] = pix_abs16_c;
-    c->pix_abs[0][1] = pix_abs16_x2_c;
-    c->pix_abs[0][2] = pix_abs16_y2_c;
-    c->pix_abs[0][3] = pix_abs16_xy2_c;
-    c->pix_abs[1][0] = pix_abs8_c;
-    c->pix_abs[1][1] = pix_abs8_x2_c;
-    c->pix_abs[1][2] = pix_abs8_y2_c;
-    c->pix_abs[1][3] = pix_abs8_xy2_c;
-
-#define dspfunc(PFX, IDX, NUM) \
-    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
-    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
-    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
-    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
-
-    dspfunc(put, 0, 16);
-    dspfunc(put_no_rnd, 0, 16);
-    dspfunc(put, 1, 8);
-    dspfunc(put_no_rnd, 1, 8);
-    dspfunc(put, 2, 4);
-    dspfunc(put, 3, 2);
-
-    dspfunc(avg, 0, 16);
-    dspfunc(avg_no_rnd, 0, 16);
-    dspfunc(avg, 1, 8);
-    dspfunc(avg_no_rnd, 1, 8);
-    dspfunc(avg, 2, 4);
-    dspfunc(avg, 3, 2);
-#undef dspfunc
-
-    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
-    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
-
-    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
-    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
-    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
-    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
-    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
-    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
-    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
-    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
-    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
-
-    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
-    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
-    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
-    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
-    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
-    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
-    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
-    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
-    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
-
-#define dspfunc(PFX, IDX, NUM) \
-    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
-    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
-    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
-    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
-    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
-    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
-    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
-    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
-    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
-    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
-    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
-    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
-    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
-    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
-    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
-    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
-
-    dspfunc(put_qpel, 0, 16);
-    dspfunc(put_no_rnd_qpel, 0, 16);
-
-    dspfunc(avg_qpel, 0, 16);
-    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
-
-    dspfunc(put_qpel, 1, 8);
-    dspfunc(put_no_rnd_qpel, 1, 8);
-
-    dspfunc(avg_qpel, 1, 8);
-    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
-
-    dspfunc(put_h264_qpel, 0, 16);
-    dspfunc(put_h264_qpel, 1, 8);
-    dspfunc(put_h264_qpel, 2, 4);
-    dspfunc(put_h264_qpel, 3, 2);
-    dspfunc(avg_h264_qpel, 0, 16);
-    dspfunc(avg_h264_qpel, 1, 8);
-    dspfunc(avg_h264_qpel, 2, 4);
-
-#undef dspfunc
-    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
-    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
-    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
-    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
-    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
-    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
-    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
-    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
+    c->h264_idct_add= ff_h264_idct_add_c;
+    c->h264_idct8_add= ff_h264_idct8_add_c;
+    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
+    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
+    c->h264_idct_add16     = ff_h264_idct_add16_c;
+    c->h264_idct8_add4     = ff_h264_idct8_add4_c;
+    c->h264_idct_add8      = ff_h264_idct_add8_c;
+    c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
 
     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
@@ -4656,87 +304,6 @@ av_cold void dsputil_init(DSPContext* c,
     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
 
-    c->draw_edges = draw_edges_c;
-
-#if CONFIG_CAVS_DECODER
-    ff_cavsdsp_init(c,avctx);
-#endif
-
-#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
-    ff_mlp_init(c, avctx);
-#endif
-#if CONFIG_VC1_DECODER
-    ff_vc1dsp_init(c,avctx);
-#endif
-#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
-    ff_intrax8dsp_init(c,avctx);
-#endif
-#if CONFIG_RV30_DECODER
-    ff_rv30dsp_init(c,avctx);
-#endif
-#if CONFIG_RV40_DECODER
-    ff_rv40dsp_init(c,avctx);
-    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
-    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
-    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
-    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
-#endif
-
-    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
-    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
-    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
-    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
-    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
-    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
-    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
-    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
-
-#define SET_CMP_FUNC(name) \
-    c->name[0]= name ## 16_c;\
-    c->name[1]= name ## 8x8_c;
-
-    SET_CMP_FUNC(hadamard8_diff)
-    c->hadamard8_diff[4]= hadamard8_intra16_c;
-    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
-    SET_CMP_FUNC(dct_sad)
-    SET_CMP_FUNC(dct_max)
-#if CONFIG_GPL
-    SET_CMP_FUNC(dct264_sad)
-#endif
-    c->sad[0]= pix_abs16_c;
-    c->sad[1]= pix_abs8_c;
-    c->sse[0]= sse16_c;
-    c->sse[1]= sse8_c;
-    c->sse[2]= sse4_c;
-    SET_CMP_FUNC(quant_psnr)
-    SET_CMP_FUNC(rd)
-    SET_CMP_FUNC(bit)
-    c->vsad[0]= vsad16_c;
-    c->vsad[4]= vsad_intra16_c;
-    c->vsad[5]= vsad_intra8_c;
-    c->vsse[0]= vsse16_c;
-    c->vsse[4]= vsse_intra16_c;
-    c->vsse[5]= vsse_intra8_c;
-    c->nsse[0]= nsse16_c;
-    c->nsse[1]= nsse8_c;
-#if CONFIG_DWT
-    ff_dsputil_init_dwt(c);
-#endif
-
-    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
-
-    c->add_bytes= add_bytes_c;
-    c->add_bytes_l2= add_bytes_l2_c;
-    c->diff_bytes= diff_bytes_c;
-    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
-    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
-    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
-    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
-    c->bswap_buf= bswap_buf;
-#if CONFIG_PNG_DECODER
-    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
-#endif
-
     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
@@ -4747,107 +314,7 @@ av_cold void dsputil_init(DSPContext* c,
     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
     c->h264_loop_filter_strength= NULL;
 
-    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
-        c->h263_h_loop_filter= h263_h_loop_filter_c;
-        c->h263_v_loop_filter= h263_v_loop_filter_c;
-    }
-
-    if (CONFIG_VP3_DECODER) {
-        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
-        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
-    }
-    if (CONFIG_VP6_DECODER) {
-        c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
-    }
-
-    c->h261_loop_filter= h261_loop_filter_c;
-
-    c->try_8x8basis= try_8x8basis_c;
-    c->add_8x8basis= add_8x8basis_c;
-
-#if CONFIG_VORBIS_DECODER
-    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
-#endif
-#if CONFIG_AC3_DECODER
-    c->ac3_downmix = ff_ac3_downmix_c;
-#endif
-#if CONFIG_LPC
-    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
-#endif
-    c->vector_fmul = vector_fmul_c;
-    c->vector_fmul_reverse = vector_fmul_reverse_c;
-    c->vector_fmul_add = vector_fmul_add_c;
-    c->vector_fmul_window = ff_vector_fmul_window_c;
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
-    c->vector_clipf = vector_clipf_c;
-    c->float_to_int16 = ff_float_to_int16_c;
-    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->scalarproduct_int16 = scalarproduct_int16_c;
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-    c->scalarproduct_float = scalarproduct_float_c;
-    c->butterflies_float = butterflies_float_c;
-    c->vector_fmul_scalar = vector_fmul_scalar_c;
-
-    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
-    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
-
-    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
-    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
-
-    c->shrink[0]= ff_img_copy_plane;
-    c->shrink[1]= ff_shrink22;
-    c->shrink[2]= ff_shrink44;
-    c->shrink[3]= ff_shrink88;
-
-    c->prefetch= just_return;
-
-    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
-    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
-
-    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
-    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
-    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
-    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
-    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
-    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
-    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
-    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
-    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
-
-    for(i=0; i<64; i++){
-        if(!c->put_2tap_qpel_pixels_tab[0][i])
-            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
-        if(!c->avg_2tap_qpel_pixels_tab[0][i])
-            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
-    }
-
-    switch(c->idct_permutation_type){
-    case FF_NO_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= i;
-        break;
-    case FF_LIBMPEG2_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
-        break;
-    case FF_SIMPLE_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= simple_mmx_permutation[i];
-        break;
-    case FF_TRANSPOSE_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
-        break;
-    case FF_PARTTRANS_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
-        break;
-    case FF_SSE2_IDCT_PERM:
-        for(i=0; i<64; i++)
-            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
-        break;
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
-    }
+    if (ARCH_ARM) ff_h264dsp_init_arm(c);
+    if (ARCH_PPC) ff_h264dsp_init_ppc(c);
+    if (HAVE_MMX) ff_h264dsp_init_x86(c);
 }
-

Copied and modified: trunk/libavcodec/h264dsp.h (from r22564, trunk/libavcodec/dsputil.h)
==============================================================================
--- trunk/libavcodec/dsputil.h	Tue Mar 16 00:40:51 2010	(r22564, copy source)
+++ trunk/libavcodec/h264dsp.h	Tue Mar 16 02:17:00 2010	(r22565)
@@ -1,7 +1,5 @@
 /*
- * DSP utils
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni at gmx.at>
  *
  * This file is part of FFmpeg.
  *
@@ -21,355 +19,30 @@
  */
 
 /**
- * @file libavcodec/dsputil.h
- * DSP utils.
- * note, many functions in here may use MMX which trashes the FPU state, it is
- * absolutely necessary to call emms_c() between dsp & float/double code
+ * @file libavcodec/h264dsp.h
+ * H.264 DSP functions.
+ * @author Michael Niedermayer <michaelni at gmx.at>
  */
 
-#ifndef AVCODEC_DSPUTIL_H
-#define AVCODEC_DSPUTIL_H
-
-#include "libavutil/intreadwrite.h"
-#include "avcodec.h"
-
-
-//#define DEBUG
-/* dct code */
-typedef short DCTELEM;
-
-void fdct_ifast (DCTELEM *data);
-void fdct_ifast248 (DCTELEM *data);
-void ff_jpeg_fdct_islow (DCTELEM *data);
-void ff_fdct248_islow (DCTELEM *data);
-
-void j_rev_dct (DCTELEM *data);
-void j_rev_dct4 (DCTELEM *data);
-void j_rev_dct2 (DCTELEM *data);
-void j_rev_dct1 (DCTELEM *data);
-void ff_wmv2_idct_c(DCTELEM *data);
-
-void ff_fdct_mmx(DCTELEM *block);
-void ff_fdct_mmx2(DCTELEM *block);
-void ff_fdct_sse2(DCTELEM *block);
-
-void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
-void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
-void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
-void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
-
-void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
-                             const float *win, float add_bias, int len);
-void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
-void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
-
-/* encoding scans */
-extern const uint8_t ff_alternate_horizontal_scan[64];
-extern const uint8_t ff_alternate_vertical_scan[64];
-extern const uint8_t ff_zigzag_direct[64];
-extern const uint8_t ff_zigzag248_direct[64];
-
-/* pixel operations */
-#define MAX_NEG_CROP 1024
-
-/* temporary */
-extern uint32_t ff_squareTbl[512];
-extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
-
-/* VP3 DSP functions */
-void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
-void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-
-void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
-void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
-
-/* VP6 DSP functions */
-void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
-                           const int16_t *h_weights, const int16_t *v_weights);
-
-/* Bink functions */
-void ff_bink_idct_c    (DCTELEM *block);
-void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
-void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
-
-/* CAVS functions */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-
-/* VC1 functions */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-
-/* EA functions */
-void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
-
-/* 1/2^n downscaling functions from imgconvert.c */
-void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-
-void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
-              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
-
-/* minimum alignment rules ;)
-If you notice errors in the align stuff, need more alignment for some ASM code
-for some CPU or need to use a function with less aligned data then send a mail
-to the ffmpeg-devel mailing list, ...
-
-!warning These alignments might not match reality, (missing attribute((align))
-stuff somewhere possible).
-I (Michael) did not check them, these are just the alignments which I think
-could be reached easily ...
-
-!future video codecs might need functions with less strict alignment
-*/
+#ifndef AVCODEC_H264DSP_H
+#define AVCODEC_H264DSP_H
 
-/*
-void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
-void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
-void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
-void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
-void clear_blocks_c(DCTELEM *blocks);
-*/
+#include <stdint.h>
+#include "dsputil.h"
 
-/* add and put pixel (decoding) */
-// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
-//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4
-typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
-typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
-typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
-typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
 typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
 
-typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
-
-#define DEF_OLD_QPEL(name)\
-void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
-void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
-void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
-
-DEF_OLD_QPEL(qpel16_mc11_old_c)
-DEF_OLD_QPEL(qpel16_mc31_old_c)
-DEF_OLD_QPEL(qpel16_mc12_old_c)
-DEF_OLD_QPEL(qpel16_mc32_old_c)
-DEF_OLD_QPEL(qpel16_mc13_old_c)
-DEF_OLD_QPEL(qpel16_mc33_old_c)
-DEF_OLD_QPEL(qpel8_mc11_old_c)
-DEF_OLD_QPEL(qpel8_mc31_old_c)
-DEF_OLD_QPEL(qpel8_mc12_old_c)
-DEF_OLD_QPEL(qpel8_mc32_old_c)
-DEF_OLD_QPEL(qpel8_mc13_old_c)
-DEF_OLD_QPEL(qpel8_mc33_old_c)
-
-#define CALL_2X_PIXELS(a, b, n)\
-static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
-    b(block  , pixels  , line_size, h);\
-    b(block+n, pixels+n, line_size, h);\
-}
-
-/* motion estimation */
-// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2
-// although currently h<4 is not used as functions with width <8 are neither used nor implemented
-typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
-
-/**
- * Scantable.
- */
-typedef struct ScanTable{
-    const uint8_t *scantable;
-    uint8_t permutated[64];
-    uint8_t raster_end[64];
-#if ARCH_PPC
-                /** Used by dct_quantize_altivec to find last-non-zero */
-    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
-#endif
-} ScanTable;
-
-void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
-
-void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
-                         int block_w, int block_h,
-                         int src_x, int src_y, int w, int h);
-
 /**
- * DSPContext.
+ * Context for storing H.264 DSP functions
  */
-typedef struct DSPContext {
-    /* pixel ops : interface with DCT */
-    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
-    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
-    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
-    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
-    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
-    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
-    /**
-     * translational global motion compensation.
-     */
-    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
-    /**
-     * global motion compensation.
-     */
-    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
-                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
-    void (*clear_block)(DCTELEM *block/*align 16*/);
-    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
-    int (*pix_sum)(uint8_t * pix, int line_size);
-    int (*pix_norm1)(uint8_t * pix, int line_size);
-// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
-
-    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
-    me_cmp_func sse[6];
-    me_cmp_func hadamard8_diff[6];
-    me_cmp_func dct_sad[6];
-    me_cmp_func quant_psnr[6];
-    me_cmp_func bit[6];
-    me_cmp_func rd[6];
-    me_cmp_func vsad[6];
-    me_cmp_func vsse[6];
-    me_cmp_func nsse[6];
-    me_cmp_func w53[6];
-    me_cmp_func w97[6];
-    me_cmp_func dct_max[6];
-    me_cmp_func dct264_sad[6];
-
-    me_cmp_func me_pre_cmp[6];
-    me_cmp_func me_cmp[6];
-    me_cmp_func me_sub_cmp[6];
-    me_cmp_func mb_cmp[6];
-    me_cmp_func ildct_cmp[6]; //only width 16 used
-    me_cmp_func frame_skip_cmp[6]; //only width 8 used
-
-    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
-                             int size);
-
-    /**
-     * Halfpel motion compensation with rounding (a+b+1)>>1.
-     * this is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination where the result is stored
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func put_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with rounding (a+b+1)>>1.
-     * This is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination into which the result is averaged (a+b+1)>>1
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func avg_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with no rounding (a+b)>>1.
-     * this is an array[2][4] of motion compensation functions for 2
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination where the result is stored
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func put_no_rnd_pixels_tab[4][4];
-
-    /**
-     * Halfpel motion compensation with no rounding (a+b)>>1.
-     * this is an array[2][4] of motion compensation functions for 2
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
-     * @param block destination into which the result is averaged (a+b)>>1
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    op_pixels_func avg_no_rnd_pixels_tab[4][4];
-
-    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
-
-    /**
-     * Thirdpel motion compensation with rounding (a+b+1)>>1.
-     * this is an array[12] of motion compensation functions for the 9 thirdpe
-     * positions<br>
-     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
-     * @param block destination where the result is stored
-     * @param pixels source
-     * @param line_size number of bytes in a horizontal line of block
-     * @param h height
-     */
-    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
-    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
-
-    qpel_mc_func put_qpel_pixels_tab[2][16];
-    qpel_mc_func avg_qpel_pixels_tab[2][16];
-    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
-    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
-    qpel_mc_func put_mspel_pixels_tab[8];
-
-    /**
-     * h264 Chroma MC
-     */
-    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
-    /* This is really one func used in VC-1 decoding */
-    h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];
-
-    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
-    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
-
-    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
-    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
-
+typedef struct H264DSPContext{
+    /* weighted MC */
     h264_weight_func weight_h264_pixels_tab[10];
     h264_biweight_func biweight_h264_pixels_tab[10];
 
-    /* AVS specific */
-    qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
-    qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
-    void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
-    void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
-    void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
-    void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
-    void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
-
-    me_cmp_func pix_abs[2][4];
-
-    /* huffyuv specific */
-    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
-    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
-    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
-    /**
-     * subtract huffyuv's variant of median prediction
-     * note, this might read from src1[-1], src2[-1]
-     */
-    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
-    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
-    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
-    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
-    /* this might write to dst[w] */
-    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
-    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
-
+    /* loop filter */
     void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
     void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
     /* v/h_loop_filter_luma_intra: align 16 */
@@ -383,141 +56,7 @@ typedef struct DSPContext {
     void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                       int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
 
-    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
-    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
-
-    void (*h261_loop_filter)(uint8_t *src, int stride);
-
-    void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
-    void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
-
-    void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
-    void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
-
-    void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, int stride,
-                             const int16_t *h_weights,const int16_t *v_weights);
-
-    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
-    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
-    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
-    /* no alignment needed */
-    void (*lpc_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
-    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
-    void (*vector_fmul)(float *dst, const float *src, int len);
-    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
-    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
-    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
-    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
-    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
-    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
-    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
-    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
-    /**
-     * Multiply a vector of floats by a scalar float.  Source and
-     * destination vectors must overlap exactly or not at all.
-     * @param dst result vector, 16-byte aligned
-     * @param src input vector, 16-byte aligned
-     * @param mul scalar value
-     * @param len length of vector, multiple of 4
-     */
-    void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
-                               int len);
-    /**
-     * Multiply a vector of floats by concatenated short vectors of
-     * floats and by a scalar float.  Source and destination vectors
-     * must overlap exactly or not at all.
-     * [0]: short vectors of length 2, 8-byte aligned
-     * [1]: short vectors of length 4, 16-byte aligned
-     * @param dst output vector, 16-byte aligned
-     * @param src input vector, 16-byte aligned
-     * @param sv  array of pointers to short vectors
-     * @param mul scalar value
-     * @param len number of elements in src and dst, multiple of 4
-     */
-    void (*vector_fmul_sv_scalar[2])(float *dst, const float *src,
-                                     const float **sv, float mul, int len);
-    /**
-     * Multiply short vectors of floats by a scalar float, store
-     * concatenated result.
-     * [0]: short vectors of length 2, 8-byte aligned
-     * [1]: short vectors of length 4, 16-byte aligned
-     * @param dst output vector, 16-byte aligned
-     * @param sv  array of pointers to short vectors
-     * @param mul scalar value
-     * @param len number of output elements, multiple of 4
-     */
-    void (*sv_fmul_scalar[2])(float *dst, const float **sv,
-                              float mul, int len);
-    /**
-     * Calculate the scalar product of two vectors of floats.
-     * @param v1  first vector, 16-byte aligned
-     * @param v2  second vector, 16-byte aligned
-     * @param len length of vectors, multiple of 4
-     */
-    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
-    /**
-     * Calculate the sum and difference of two vectors of floats.
-     * @param v1  first input vector, sum output, 16-byte aligned
-     * @param v2  second input vector, difference output, 16-byte aligned
-     * @param len length of vectors, multiple of 4
-     */
-    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
-
-    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
-     * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
-    void (*float_to_int16)(int16_t *dst, const float *src, long len);
-    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
-    /* (I)DCT */
-    void (*fdct)(DCTELEM *block/* align 16*/);
-    void (*fdct248)(DCTELEM *block/* align 16*/);
-
-    /* IDCT really*/
-    void (*idct)(DCTELEM *block/* align 16*/);
-
-    /**
-     * block -> idct -> clip to unsigned 8 bit -> dest.
-     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-
-    /**
-     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
-
-    /**
-     * idct input permutation.
-     * several optimized IDCTs need a permutated input (relative to the normal order of the reference
-     * IDCT)
-     * this permutation must be performed before the idct_put/add, note, normally this can be merged
-     * with the zigzag/alternate scan<br>
-     * an example to avoid confusion:
-     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
-     * - (x -> referece dct -> reference idct -> x)
-     * - (x -> referece dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
-     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
-     */
-    uint8_t idct_permutation[64];
-    int idct_permutation_type;
-#define FF_NO_IDCT_PERM 1
-#define FF_LIBMPEG2_IDCT_PERM 2
-#define FF_SIMPLE_IDCT_PERM 3
-#define FF_TRANSPOSE_IDCT_PERM 4
-#define FF_PARTTRANS_IDCT_PERM 5
-#define FF_SSE2_IDCT_PERM 6
-
-    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
-    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
-#define BASIS_SHIFT 16
-#define RECON_SHIFT 6
-
-    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
-#define EDGE_WIDTH 16
-
-    /* h264 functions */
+    /* IDCT */
     /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
        NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
         The reason for above, is that no 2 out of one list may use a different permutation.
@@ -531,311 +70,11 @@ typedef struct DSPContext {
     void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+}H264DSPContext;
 
-    void (*prefetch)(void *mem, int stride, int h);
-
-    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
-
-    /* mlp/truehd functions */
-    void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
-                               int firorder, int iirorder,
-                               unsigned int filter_shift, int32_t mask, int blocksize,
-                               int32_t *sample_buffer);
-
-    /* vc1 functions */
-    void (*vc1_inv_trans_8x8)(DCTELEM *b);
-    void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
-    void (*vc1_v_overlap)(uint8_t* src, int stride);
-    void (*vc1_h_overlap)(uint8_t* src, int stride);
-    void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
-    void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter8)(uint8_t *src, int stride, int pq);
-    void (*vc1_v_loop_filter16)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter16)(uint8_t *src, int stride, int pq);
-    /* put 8x8 block with bicubic interpolation and quarterpel precision
-     * last argument is actually round value instead of height
-     */
-    op_pixels_func put_vc1_mspel_pixels_tab[16];
-    op_pixels_func avg_vc1_mspel_pixels_tab[16];
-
-    /* intrax8 functions */
-    void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
-    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
-           int * range, int * sum,  int edges);
-
-    /**
-     * Calculate scalar product of two vectors.
-     * @param len length of vectors, should be multiple of 16
-     * @param shift number of bits to discard from product
-     */
-    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
-    /* ape functions */
-    /**
-     * Calculate scalar product of v1 and v2,
-     * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
-     */
-    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
-
-    /* rv30 functions */
-    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
-    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
-
-    /* rv40 functions */
-    qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
-    qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
-    h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];
-
-    /* bink functions */
-    op_fill_func fill_block_tab[2];
-    void (*scale_block)(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize);
-} DSPContext;
-
-void dsputil_static_init(void);
-void dsputil_init(DSPContext* p, AVCodecContext *avctx);
-
-int ff_check_alignment(void);
-
-/**
- * permute block according to permuatation.
- * @param last last non zero element in scantable order
- */
-void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
-
-void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
-
-#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
-
-static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
-{
-    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
-}
-
-static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
-{
-    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
-}
-
-static inline int get_penalty_factor(int lambda, int lambda2, int type){
-    switch(type&0xFF){
-    default:
-    case FF_CMP_SAD:
-        return lambda>>FF_LAMBDA_SHIFT;
-    case FF_CMP_DCT:
-        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
-    case FF_CMP_W53:
-        return (4*lambda)>>(FF_LAMBDA_SHIFT);
-    case FF_CMP_W97:
-        return (2*lambda)>>(FF_LAMBDA_SHIFT);
-    case FF_CMP_SATD:
-    case FF_CMP_DCT264:
-        return (2*lambda)>>FF_LAMBDA_SHIFT;
-    case FF_CMP_RD:
-    case FF_CMP_PSNR:
-    case FF_CMP_SSE:
-    case FF_CMP_NSSE:
-        return lambda2>>FF_LAMBDA_SHIFT;
-    case FF_CMP_BIT:
-        return 1;
-    }
-}
-
-/**
- * Empty mmx state.
- * this must be called between any dsp function and float/double code.
- * for example sin(); dsp->idct_put(); emms_c(); cos()
- */
-#define emms_c()
-
-/* should be defined by architectures supporting
-   one or more MultiMedia extension */
-int mm_support(void);
-extern int mm_flags;
-
-void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
-
-void ff_dsputil_init_dwt(DSPContext *c);
-void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
-void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);
-
-#if HAVE_MMX
-
-#undef emms_c
-
-static inline void emms(void)
-{
-    __asm__ volatile ("emms;":::"memory");
-}
-
-
-#define emms_c() \
-{\
-    if (mm_flags & FF_MM_MMX)\
-        emms();\
-}
-
-#elif ARCH_ARM
-
-#if HAVE_NEON
-#   define STRIDE_ALIGN 16
-#endif
-
-#elif ARCH_PPC
-
-#define STRIDE_ALIGN 16
-
-#elif HAVE_MMI
-
-#define STRIDE_ALIGN 16
-
-#else
-
-#define mm_flags 0
-#define mm_support() 0
-
-#endif
-
-#ifndef STRIDE_ALIGN
-#   define STRIDE_ALIGN 8
-#endif
-
-#define LOCAL_ALIGNED(a, t, v, s, ...)                          \
-    uint8_t la_##v[sizeof(t s __VA_ARGS__) + (a)];              \
-    t (*v) __VA_ARGS__ = (void *)FFALIGN((uintptr_t)la_##v, a)
-
-#if HAVE_LOCAL_ALIGNED_8
-#   define LOCAL_ALIGNED_8(t, v, s, ...) DECLARE_ALIGNED(8, t, v) s __VA_ARGS__
-#else
-#   define LOCAL_ALIGNED_8(t, v, s, ...) LOCAL_ALIGNED(8, t, v, s, __VA_ARGS__)
-#endif
-
-#if HAVE_LOCAL_ALIGNED_16
-#   define LOCAL_ALIGNED_16(t, v, s, ...) DECLARE_ALIGNED(16, t, v) s __VA_ARGS__
-#else
-#   define LOCAL_ALIGNED_16(t, v, s, ...) LOCAL_ALIGNED(16, t, v, s, __VA_ARGS__)
-#endif
-
-/* PSNR */
-void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
-              int orig_linesize[3], int coded_linesize,
-              AVCodecContext *avctx);
-
-#define WRAPPER8_16(name8, name16)\
-static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
-    return name8(s, dst           , src           , stride, h)\
-          +name8(s, dst+8         , src+8         , stride, h);\
-}
-
-#define WRAPPER8_16_SQ(name8, name16)\
-static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
-    int score=0;\
-    score +=name8(s, dst           , src           , stride, 8);\
-    score +=name8(s, dst+8         , src+8         , stride, 8);\
-    if(h==16){\
-        dst += 8*stride;\
-        src += 8*stride;\
-        score +=name8(s, dst           , src           , stride, 8);\
-        score +=name8(s, dst+8         , src+8         , stride, 8);\
-    }\
-    return score;\
-}
-
-
-static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN16(dst   , AV_RN16(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        dst[8]= src[8];
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        AV_WN32(dst+8 , AV_RN32(src+8 ));
-        AV_WN32(dst+12, AV_RN32(src+12));
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
-
-static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
-{
-    int i;
-    for(i=0; i<h; i++)
-    {
-        AV_WN32(dst   , AV_RN32(src   ));
-        AV_WN32(dst+4 , AV_RN32(src+4 ));
-        AV_WN32(dst+8 , AV_RN32(src+8 ));
-        AV_WN32(dst+12, AV_RN32(src+12));
-        dst[16]= src[16];
-        dst+=dstStride;
-        src+=srcStride;
-    }
-}
+void ff_h264dsp_init(H264DSPContext *c);
+void ff_h264dsp_init_arm(H264DSPContext *c);
+void ff_h264dsp_init_ppc(H264DSPContext *c);
+void ff_h264dsp_init_x86(H264DSPContext *c);
 
-#endif /* AVCODEC_DSPUTIL_H */
+#endif /* AVCODEC_H264DSP_H */

Modified: trunk/libavcodec/ppc/h264_altivec.c
==============================================================================
--- trunk/libavcodec/ppc/h264_altivec.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/ppc/h264_altivec.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -20,6 +20,7 @@
 
 #include "libavcodec/dsputil.h"
 #include "libavcodec/h264data.h"
+#include "libavcodec/h264dsp.h"
 
 #include "dsputil_ppc.h"
 #include "dsputil_altivec.h"
@@ -974,16 +975,6 @@ void dsputil_h264_init_ppc(DSPContext* c
         c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
         c->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
         c->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
-        c->h264_idct_add = ff_h264_idct_add_altivec;
-        c->h264_idct_add8 = ff_h264_idct_add8_altivec;
-        c->h264_idct_add16 = ff_h264_idct_add16_altivec;
-        c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
-        c->h264_idct_dc_add= h264_idct_dc_add_altivec;
-        c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
-        c->h264_idct8_add = ff_h264_idct8_add_altivec;
-        c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
-        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
-        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
 
 #define dspfunc(PFX, IDX, NUM) \
         c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
@@ -1006,6 +997,22 @@ void dsputil_h264_init_ppc(DSPContext* c
         dspfunc(put_h264_qpel, 0, 16);
         dspfunc(avg_h264_qpel, 0, 16);
 #undef dspfunc
+    }
+}
+
+void ff_h264dsp_init_ppc(H264DSPContext *c)
+{
+    if (has_altivec()) {
+        c->h264_idct_add = ff_h264_idct_add_altivec;
+        c->h264_idct_add8 = ff_h264_idct_add8_altivec;
+        c->h264_idct_add16 = ff_h264_idct_add16_altivec;
+        c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
+        c->h264_idct_dc_add= h264_idct_dc_add_altivec;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
+        c->h264_idct8_add = ff_h264_idct8_add_altivec;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
+        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
+        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;

Modified: trunk/libavcodec/x86/dsputil_mmx.c
==============================================================================
--- trunk/libavcodec/x86/dsputil_mmx.c	Tue Mar 16 00:40:51 2010	(r22564)
+++ trunk/libavcodec/x86/dsputil_mmx.c	Tue Mar 16 02:17:00 2010	(r22565)
@@ -24,6 +24,7 @@
 
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
+#include "libavcodec/h264dsp.h"
 #include "libavcodec/mpegvideo.h"
 #include "libavcodec/simple_idct.h"
 #include "dsputil_mmx.h"
@@ -2618,16 +2619,6 @@ void dsputil_init_mmx(DSPContext* c, AVC
         c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
         c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
 
-        c->h264_idct_dc_add=
-        c->h264_idct_add= ff_h264_idct_add_mmx;
-        c->h264_idct8_dc_add=
-        c->h264_idct8_add= ff_h264_idct8_add_mmx;
-
-        c->h264_idct_add16     = ff_h264_idct_add16_mmx;
-        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
-        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
-        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
-
         if (CONFIG_VP6_DECODER) {
             c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
         }
@@ -2649,13 +2640,6 @@ void dsputil_init_mmx(DSPContext* c, AVC
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
 
-            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
-            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
-            c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
-            c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
-            c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
-            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
-
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
@@ -2716,31 +2700,6 @@ void dsputil_init_mmx(DSPContext* c, AVC
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
             c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
             c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
-            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
-            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
-            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
-            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
-            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
-            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
-            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
-
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
 
 #if HAVE_YASM
             c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
@@ -2825,9 +2784,6 @@ void dsputil_init_mmx(DSPContext* c, AVC
             H264_QPEL_FUNCS(0, 0, sse2);
         }
         if(mm_flags & FF_MM_SSE2){
-            c->h264_idct8_add = ff_h264_idct8_add_sse2;
-            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
-
             H264_QPEL_FUNCS(0, 1, sse2);
             H264_QPEL_FUNCS(0, 2, sse2);
             H264_QPEL_FUNCS(0, 3, sse2);
@@ -2874,26 +2830,6 @@ void dsputil_init_mmx(DSPContext* c, AVC
         }
 #endif
 
-#if CONFIG_GPL && HAVE_YASM
-        if (mm_flags & FF_MM_MMX2){
-#if ARCH_X86_32
-            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
-#endif
-            if( mm_flags&FF_MM_SSE2 ){
-#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
-                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
-#endif
-                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
-                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
-                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
-            }
-        }
-#endif
-
         if(mm_flags & FF_MM_3DNOW){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
             c->vector_fmul = vector_fmul_3dnow;
@@ -2983,3 +2919,81 @@ void dsputil_init_mmx(DSPContext* c, AVC
     //ff_idct = just_return;
 #endif
 }
+
+#if CONFIG_H264DSP
+void ff_h264dsp_init_x86(H264DSPContext *c)
+{
+    mm_flags = mm_support();
+
+    if (mm_flags & FF_MM_MMX) {
+        c->h264_idct_dc_add=
+        c->h264_idct_add= ff_h264_idct_add_mmx;
+        c->h264_idct8_dc_add=
+        c->h264_idct8_add= ff_h264_idct8_add_mmx;
+
+        c->h264_idct_add16     = ff_h264_idct_add16_mmx;
+        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
+        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
+        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+
+        if (mm_flags & FF_MM_MMX2) {
+            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
+            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
+            c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
+            c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
+            c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
+            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
+
+            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
+            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
+            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
+            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
+            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
+            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
+
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+        }
+        if(mm_flags & FF_MM_SSE2){
+            c->h264_idct8_add = ff_h264_idct8_add_sse2;
+            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+        }
+
+#if CONFIG_GPL && HAVE_YASM
+        if (mm_flags & FF_MM_MMX2){
+#if ARCH_X86_32
+            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+#endif
+            if( mm_flags&FF_MM_SSE2 ){
+#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
+                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
+                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+#endif
+                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
+                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
+                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+            }
+        }
+#endif
+    }
+}
+#endif /* CONFIG_H264DSP */