[FFmpeg-devel] [PATCH 6/6] x86: hevc_mc: allow some functions for x86_32

Christophe Gisquet christophe.gisquet at gmail.com
Sun Jun 1 16:13:02 CEST 2014


Now that the gpr/xmm register count has decreased, some functions are
usable by x86_32. Around 2% speedup.
---
 libavcodec/x86/hevc_mc.asm    | 831 +++++++++++++++++++++---------------------
 libavcodec/x86/hevcdsp.h      |   3 +
 libavcodec/x86/hevcdsp_init.c |  26 +-
 3 files changed, 441 insertions(+), 419 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 3ed2662..efde131 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -72,20 +72,57 @@ QPEL_TABLE 10, 4, w, sse4
 
 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
-%if ARCH_X86_64
+INIT_XMM sse4
+%macro PEL_10STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+    PEL_10STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endmacro
 
-%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
-%if %1 <= 4
-    movq              %3, [%2]                                              ; load data from source2
-%elif %1 <= 8
-    movdqa            %3, [%2]                                              ; load data from source2
-%elif %1 <= 12
-    movdqa            %3, [%2]                                              ; load data from source2
-    movq              %4, [%2+16]                                           ; load data from source2
-%else
-    movdqa            %3, [%2]                                              ; load data from source2
-    movdqa            %4, [%2+16]                                           ; load data from source2
-%endif
+%macro PEL_8STORE2 3
+    pextrw          [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+    movd            [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+    movd            [%1], %2
+    pextrw        [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+    movq            [%1], %2
+    psrldq            %2, 8
+    movd          [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+    movdqa          [%1], %2
+%endmacro
+
+%macro LOOP_END 4
+    lea              %1q, [%1q+2*%2q]            ; dst += dststride
+    lea              %3q, [%3q+  %4q]            ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
 %endmacro
 
 %macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
@@ -98,17 +135,17 @@ QPEL_TABLE 10, 4, w, sse4
 %endif
 %endmacro
 
-%macro SIMPLE_8LOAD 5    ;width, bitd, tab, r1, r2
-%if %1 == 2 || (%2 == 8 && %1 <= 4)
-    movq              %4, [%3]                                              ; load data from source2
-%elif %1 == 4 || (%2 == 8 && %1 <= 8)
-    movdqa            %4, [%3]                                              ; load data from source2
+%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
+%if %1 <= 4
+    movq              %3, [%2]                                              ; load data from source2
+%elif %1 <= 8
+    movdqa            %3, [%2]                                              ; load data from source2
 %elif %1 <= 12
-    movdqa            %4, [%3]                                              ; load data from source2
-    movq              %5, [%3+16]                                           ; load data from source2
+    movdqa            %3, [%2]                                              ; load data from source2
+    movq              %4, [%2+16]                                           ; load data from source2
 %else
-    movdqa            %4, [%3]                                              ; load data from source2
-    movdqa            %5, [%3+16]                                           ; load data from source2
+    movdqa            %3, [%2]                                              ; load data from source2
+    movdqa            %4, [%2+16]                                           ; load data from source2
 %endif
 %endmacro
 
@@ -125,42 +162,6 @@ QPEL_TABLE 10, 4, w, sse4
     mova              m5, [FILTER+16]     ; get 2 last values of filters
 %endmacro
 
-%macro EPEL_HV_FILTER 1
-%ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
-%else
-    %define rfilterq hevc_epel_filters_sse4_%1
-%endif
-    sub              mxq, 1
-    sub              myq, 1
-    shl              mxq, 5                      ; multiply by 32
-    shl              myq, 5                      ; multiply by 32
-    movdqa           m14, [rfilterq + mxq]        ; get 2 first values of filters
-    movdqa           m15, [rfilterq + mxq+16]     ; get 2 last values of filters
-    lea           r3srcq, [srcstrideq*3]
-
-%ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_10]
-%else
-    %define rfilterq hevc_epel_filters_sse4_10
-%endif
-    movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
-    movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
-%endmacro
-
-%macro QPEL_FILTER 2
-%ifdef PIC
-    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
-%else
-    %define rfilterq hevc_qpel_filters_sse4_%1
-%endif
-    lea              %2q, [%2q*8-8]
-    movdqa           m12, [rfilterq + %2q*8]       ; get 4 first values of filters
-    movdqa           m13, [rfilterq + %2q*8 + 16]  ; get 4 first values of filters
-    movdqa           m14, [rfilterq + %2q*8 + 32]  ; get 4 first values of filters
-    movdqa           m15, [rfilterq + %2q*8 + 48]  ; get 4 first values of filters
-%endmacro
-
 %macro EPEL_LOAD 4-5
 %if %0 == 5
     %define rfilterq %2
@@ -207,6 +208,125 @@ QPEL_TABLE 10, 4, w, sse4
 %endmacro
 
 
+%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%if %2 == 8
+%if %1 > 8
+    punpckhbw         m1, m0, m2
+    psllw             m1, 14-%2
+%endif
+    punpcklbw         m0, m2
+%endif
+    psllw             m0, 14-%2
+%endmacro
+
+
+%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%if %1 == 8
+    pmaddubsw         m0, %3   ;x1*c1+x2*c2
+    pmaddubsw         m2, %4   ;x3*c3+x4*c4
+    paddw             m0, m2
+%if %2 > 8
+    pmaddubsw         m1, %3
+    pmaddubsw         m3, %4
+    paddw             m1, m3
+%endif
+%else
+    pmaddwd           m0, %3
+    pmaddwd           m2, %4
+    paddd             m0, m2
+%if %2 > 4
+    pmaddwd           m1, %3
+    pmaddwd           m3, %4
+    paddd             m1, m3
+%endif
+%if %1 != 8
+    psrad             m0, %1-8
+    psrad             m1, %1-8
+%endif
+    packssdw          m0, m1
+%endif
+%endmacro
+
+
+%macro BI_COMPUTE 7     ; width, bitd, src1l, src1h, src2l, src2h, pw
+    paddsw            %3, %5
+%if %1 > 8
+    paddsw            %4, %6
+%endif
+    UNI_COMPUTE       %1, %2, %3, %4, %7
+%endmacro
+
+%macro UNI_COMPUTE 5
+    pmulhrsw          %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+    pmulhrsw          %4, %5
+%endif
+%if %2 == 8
+    packuswb          %3, %4
+%else
+    pminsw            %3, [max_pixels_%2]
+    pmaxsw            %3, [zero]
+%if %1 > 8
+    pminsw            %4, [max_pixels_%2]
+    pmaxsw            %4, [zero]
+%endif
+%endif
+%endmacro
+
+
+%if ARCH_X86_64
+
+%macro SIMPLE_8LOAD 5    ;width, bitd, tab, r1, r2
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movq              %4, [%3]                                              ; load data from source2
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movdqa            %4, [%3]                                              ; load data from source2
+%elif %1 <= 12
+    movdqa            %4, [%3]                                              ; load data from source2
+    movq              %5, [%3+16]                                           ; load data from source2
+%else
+    movdqa            %4, [%3]                                              ; load data from source2
+    movdqa            %5, [%3+16]                                           ; load data from source2
+%endif
+%endmacro
+
+%macro EPEL_HV_FILTER 1
+%ifdef PIC
+    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+%else
+    %define rfilterq hevc_epel_filters_sse4_%1
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, 5                      ; multiply by 32
+    shl              myq, 5                      ; multiply by 32
+    movdqa           m14, [rfilterq + mxq]        ; get 2 first values of filters
+    movdqa           m15, [rfilterq + mxq+16]     ; get 2 last values of filters
+    lea           r3srcq, [srcstrideq*3]
+
+%ifdef PIC
+    lea         rfilterq, [hevc_epel_filters_sse4_10]
+%else
+    %define rfilterq hevc_epel_filters_sse4_10
+%endif
+    movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
+    movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
+%endmacro
+
+%macro QPEL_FILTER 2
+%ifdef PIC
+    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
+%else
+    %define rfilterq hevc_qpel_filters_sse4_%1
+%endif
+    lea              %2q, [%2q*8-8]
+    movdqa           m12, [rfilterq + %2q*8]       ; get 4 first values of filters
+    movdqa           m13, [rfilterq + %2q*8 + 16]  ; get 4 second values of filters
+    movdqa           m14, [rfilterq + %2q*8 + 32]  ; get 4 third values of filters
+    movdqa           m15, [rfilterq + %2q*8 + 48]  ; get 4 last values of filters
+%endmacro
+
+
 %macro QPEL_H_LOAD 4
 %assign %%stride (%1+7)/8
 %if %1 == 8
@@ -300,97 +420,6 @@ QPEL_TABLE 10, 4, w, sse4
 %endif
 %endmacro
 
-%macro PEL_10STORE2 3
-    movd           [%1], %2
-%endmacro
-%macro PEL_10STORE4 3
-    movq           [%1], %2
-%endmacro
-%macro PEL_10STORE6 3
-    movq           [%1], %2
-    psrldq            %2, 8
-    movd         [%1+8], %2
-%endmacro
-%macro PEL_10STORE8 3
-    movdqa         [%1], %2
-%endmacro
-%macro PEL_10STORE12 3
-    movdqa         [%1], %2
-    movq        [%1+16], %3
-%endmacro
-%macro PEL_10STORE16 3
-    PEL_10STORE8      %1, %2, %3
-    movdqa       [%1+16], %3
-%endmacro
-
-%macro PEL_8STORE2 3
-    pextrw          [%1], %2, 0
-%endmacro
-%macro PEL_8STORE4 3
-    movd            [%1], %2
-%endmacro
-%macro PEL_8STORE6 3
-    movd            [%1], %2
-    pextrw        [%1+4], %2, 2
-%endmacro
-%macro PEL_8STORE8 3
-    movq           [%1], %2
-%endmacro
-%macro PEL_8STORE12 3
-    movq            [%1], %2
-    psrldq            %2, 8
-    movd          [%1+8], %2
-%endmacro
-%macro PEL_8STORE16 3
-    movdqa          [%1], %2
-%endmacro
-
-%macro LOOP_END 4
-    lea              %1q, [%1q+2*%2q]            ; dst += dststride
-    lea              %3q, [%3q+  %4q]            ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-%endmacro
-
-
-%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
-%if %2 == 8
-%if %1 > 8
-    punpckhbw         m1, m0, m2
-    psllw             m1, 14-%2
-%endif
-    punpcklbw         m0, m2
-%endif
-    psllw             m0, 14-%2
-%endmacro
-
-
-%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
-%if %1 == 8
-    pmaddubsw         m0, %3   ;x1*c1+x2*c2
-    pmaddubsw         m2, %4   ;x3*c3+x4*c4
-    paddw             m0, m2
-%if %2 > 8
-    pmaddubsw         m1, %3
-    pmaddubsw         m3, %4
-    paddw             m1, m3
-%endif
-%else
-    pmaddwd           m0, %3
-    pmaddwd           m2, %4
-    paddd             m0, m2
-%if %2 > 4
-    pmaddwd           m1, %3
-    pmaddwd           m3, %4
-    paddd             m1, m3
-%endif
-%if %1 != 8
-    psrad             m0, %1-8
-    psrad             m1, %1-8
-%endif
-    packssdw          m0, m1
-%endif
-%endmacro
 
 %macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
 %ifdef PIC
@@ -423,238 +452,59 @@ QPEL_TABLE 10, 4, w, sse4
     pmaddwd           m3, [rfilterq + %3q*8+16]
     pmaddwd           m5, [rfilterq + %3q*8+32]
     pmaddwd           m7, [rfilterq + %3q*8+48]
-    paddd             m1, m3
-    paddd             m5, m7
-    paddd             m1, m5
-%if %2 != 8
-    psrad             m1, %2-8
-%endif
-%endif
-    p%4               m0, m1
-%endif
-%endmacro
-
-%macro QPEL_COMPUTE 2     ; width, bitdepth
-%if %2 == 8
-    pmaddubsw         m0, m12   ;x1*c1+x2*c2
-    pmaddubsw         m2, m13   ;x3*c3+x4*c4
-    pmaddubsw         m4, m14   ;x5*c5+x6*c6
-    pmaddubsw         m6, m15   ;x7*c7+x8*c8
-    paddw             m0, m2
-    paddw             m4, m6
-    paddw             m0, m4
-%if %1 > 8
-    pmaddubsw         m1, m12
-    pmaddubsw         m3, m13
-    pmaddubsw         m5, m14
-    pmaddubsw         m7, m15
-    paddw             m1, m3
-    paddw             m5, m7
-    paddw             m1, m5
-%endif
-%else
-    pmaddwd           m0, m12
-    pmaddwd           m2, m13
-    pmaddwd           m4, m14
-    pmaddwd           m6, m15
-    paddd             m0, m2
-    paddd             m4, m6
-    paddd             m0, m4
-%if %2 != 8
-    psrad             m0, %2-8
-%endif
-%if %1 > 4
-    pmaddwd           m1, m12
-    pmaddwd           m3, m13
-    pmaddwd           m5, m14
-    pmaddwd           m7, m15
-    paddd             m1, m3
-    paddd             m5, m7
-    paddd             m1, m5
-%if %2 != 8
-    psrad             m1, %2-8
-%endif
-%endif
-%endif
-%endmacro
-
-%macro BI_COMPUTE 7     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
-    paddsw            %3, %5
-%if %1 > 8
-    paddsw            %4, %6
-%endif
-    UNI_COMPUTE       %1, %2, %3, %4, %7
-%endmacro
-
-%macro UNI_COMPUTE 5
-    pmulhrsw          %3, %5
-%if %1 > 8 || (%2 > 8 && %1 > 4)
-    pmulhrsw          %4, %5
-%endif
-%if %2 == 8
-    packuswb          %3, %4
-%else
-    pminsw            %3, [max_pixels_%2]
-    pmaxsw            %3, [zero]
-%if %1 > 8
-    pminsw            %4, [max_pixels_%2]
-    pmaxsw            %4, [zero]
-%endif
-%endif
-%endmacro
-
-INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
-; ******************************
-; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
-;                         uint8_t *_src, ptrdiff_t _srcstride,
-;                         int height, int mx, int my)
-; ******************************
-
-%macro HEVC_PUT_HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
-    pxor               m2, m2
-.loop
-    SIMPLE_LOAD       %1, %2, srcq, m0
-    MC_PIXEL_COMPUTE  %1, %2
-    PEL_10STORE%1     dstq, m0, m1
-    LOOP_END         dst, dststride, src, srcstride
-    RET
-
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
-    pxor              m2, m2
-.loop
-    SIMPLE_LOAD       %1, %2, srcq, m0
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
-    pxor              m2, m2
-    movdqa            m5, [pw_bi_%2]
-.loop
-    SIMPLE_LOAD       %1, %2, srcq, m0
-    SIMPLE_BILOAD     %1, src2q, m3, m4
-    MC_PIXEL_COMPUTE  %1, %2
-    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
-    lea            src2q, [src2q+2*src2strideq]  ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-%endmacro
-
-
-; ******************************
-; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
-;                       uint8_t *_src, ptrdiff_t _srcstride,
-;                       int width, int height, int mx, int my,
-;                       int16_t* mcbuffer)
-; ******************************
-
-
-%macro HEVC_PUT_HEVC_EPEL 2
-cglobal hevc_put_hevc_epel_h%1_%2, 6, 6, 6, dst, dststride, src, srcstride, height, mx
-%assign %%stride ((%2 + 7)/8)
-    EPEL_FILTER       %2, mx
-.loop
-    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    PEL_10STORE%1      dstq, m0, m1
-    LOOP_END         dst, dststride, src, srcstride
-    RET
-
-cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, mx
-%assign %%stride ((%2 + 7)/8)
-    movdqa            m6, [pw_%2]
-    EPEL_FILTER       %2, mx
-.loop
-    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    UNI_COMPUTE       %1, %2, m0, m1, m6
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-cglobal hevc_put_hevc_bi_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx
-    mov          heightd, mxm
-    movdqa            m6, [pw_bi_%2]
-    EPEL_FILTER       %2, height
-    mov          heightd, heightm
-.loop
-    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
-    lea            src2q, [src2q+2*src2strideq]  ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-; ******************************
-; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
-;                      uint8_t *_src, ptrdiff_t _srcstride,
-;                      int width, int height, int mx, int my,
-;                      int16_t* mcbuffer)
-; ******************************
-
-cglobal hevc_put_hevc_epel_v%1_%2, 5, 6, 6, dst, dststride, src, srcstride, height, r3src, my
-    mov              r5d, mym
-    EPEL_FILTER       %2, r5
-    sub             srcq, srcstrideq
-.loop
-    EPEL_LOAD         %2, srcq, srcstride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    PEL_10STORE%1     dstq, m0, m1
-    lea             dstq, [dstq+2*dststrideq]
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 6, 7, dst, dststride, src, srcstride, height, r3src, my
-    mov              r5d, mym
-    EPEL_FILTER       %2, r5
-    movdqa            m6, [pw_%2]
-    sub             srcq, srcstrideq
-.loop
-    EPEL_LOAD         %2, srcq, srcstride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    UNI_COMPUTE       %1, %2, m0, m1, m6
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
-
-
-cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 7, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
-    mov          heightd, mym
-    EPEL_FILTER       %2, height
-    movdqa            m6, [pw_bi_%2]
-    mov          heightd, heightm
-    sub             srcq, srcstrideq
-.loop
-    EPEL_LOAD         %2, srcq, srcstride, %1, 1
-    EPEL_COMPUTE      %2, %1, m4, m5
-    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
-    PEL_%2STORE%1   dstq, m0, m1
-    lea             dstq, [dstq+dststrideq]      ; dst += dststride
-    lea            src2q, [src2q+2*src2strideq]  ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+    p%4               m0, m1
+%endif
+%endmacro
+
+%macro QPEL_COMPUTE 2     ; width, bitdepth
+%if %2 == 8
+    pmaddubsw         m0, m12   ;x1*c1+x2*c2
+    pmaddubsw         m2, m13   ;x3*c3+x4*c4
+    pmaddubsw         m4, m14   ;x5*c5+x6*c6
+    pmaddubsw         m6, m15   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%if %1 > 8
+    pmaddubsw         m1, m12
+    pmaddubsw         m3, m13
+    pmaddubsw         m5, m14
+    pmaddubsw         m7, m15
+    paddw             m1, m3
+    paddw             m5, m7
+    paddw             m1, m5
+%endif
+%else
+    pmaddwd           m0, m12
+    pmaddwd           m2, m13
+    pmaddwd           m4, m14
+    pmaddwd           m6, m15
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, m12
+    pmaddwd           m3, m13
+    pmaddwd           m5, m14
+    pmaddwd           m7, m15
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+%endif
 %endmacro
 
 
@@ -1120,6 +970,37 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride
     RET
 %endmacro
 
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+
+HEVC_PUT_HEVC_QPEL 4,  8
+HEVC_PUT_HEVC_QPEL 8,  8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
+
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+%endif ; ARCH_X86_64
+
 %macro WEIGHTING_FUNCS 2
 %if WIN64 || ARCH_X86_32
 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
@@ -1172,6 +1053,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh
     jnz               .loop                      ; height loop
     RET
 
+%if ARCH_X86_64
 cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
     mov              r6d, denomm
     movd              m2, wx0m         ; WX0
@@ -1225,8 +1107,11 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2,
     dec              r6d                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endif ; ARCH_X86_64
+
 %endmacro
 
+INIT_XMM sse4
 WEIGHTING_FUNCS 2, 8
 WEIGHTING_FUNCS 4, 8
 WEIGHTING_FUNCS 6, 8
@@ -1237,6 +1122,52 @@ WEIGHTING_FUNCS 4, 10
 WEIGHTING_FUNCS 6, 10
 WEIGHTING_FUNCS 8, 10
 
+
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+;                         uint8_t *_src, ptrdiff_t _srcstride,
+;                         int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+    pxor               m2, m2
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    MC_PIXEL_COMPUTE  %1, %2
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, dststride, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+    pxor              m2, m2
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
+    pxor              m2, m2
+    movdqa            m5, [pw_bi_%2]
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    SIMPLE_BILOAD     %1, src2q, m3, m4
+    MC_PIXEL_COMPUTE  %1, %2
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
+    lea            src2q, [src2q+2*src2strideq]  ; src2 += src2stride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+%endmacro
+
+INIT_XMM sse4
 HEVC_PUT_HEVC_PEL_PIXELS  2, 8
 HEVC_PUT_HEVC_PEL_PIXELS  4, 8
 HEVC_PUT_HEVC_PEL_PIXELS  6, 8
@@ -1250,6 +1181,112 @@ HEVC_PUT_HEVC_PEL_PIXELS 6, 10
 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
 
 
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int width, int height, int mx, int my,
+;                       int16_t* mcbuffer)
+; ******************************
+%macro HEVC_PUT_HEVC_EPEL 2
+cglobal hevc_put_hevc_epel_h%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, mx
+%assign %%stride ((%2 + 7)/8)
+    EPEL_FILTER       %2, mx
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 6
+    EPEL_COMPUTE      %2, %1, m4, m5
+    PEL_10STORE%1      dstq, m0, m1
+    LOOP_END         dst, dststride, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 6, 8, dst, dststride, src, srcstride, height, mx
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_%2]
+    EPEL_FILTER       %2, mx
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 7
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 6, 7, 8, dst, dststride, src, srcstride, src2, src2stride,height, mx
+    mov          heightd, mxm
+    movdqa            m6, [pw_bi_%2]
+    EPEL_FILTER       %2, height
+    mov          heightd, heightm
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1, 7
+    EPEL_COMPUTE      %2, %1, m4, m5
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    lea             srcq, [srcq+srcstrideq]      ; src += srcstride
+    lea            src2q, [src2q+2*src2strideq]  ; src2 += src2stride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+;                      uint8_t *_src, ptrdiff_t _srcstride,
+;                      int width, int height, int mx, int my,
+;                      int16_t* mcbuffer)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 5, 6, 7, dst, dststride, src, srcstride, height, r3src, my
+    mov              r5d, mym
+    EPEL_FILTER       %2, r5
+    sub             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1, 6
+    EPEL_COMPUTE      %2, %1, m4, m5
+    PEL_10STORE%1     dstq, m0, m1
+    lea             dstq, [dstq+2*dststrideq]
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 6, 8, dst, dststride, src, srcstride, height, r3src, my
+    mov              r5d, mym
+    EPEL_FILTER       %2, r5
+    movdqa            m6, [pw_%2]
+    sub             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1, 7
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 7, 8, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my
+    mov          heightd, mym
+    EPEL_FILTER       %2, height
+    movdqa            m6, [pw_bi_%2]
+    mov          heightd, heightm
+    sub             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1, 7
+    EPEL_COMPUTE      %2, %1, m4, m5
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    lea             dstq, [dstq+dststrideq]      ; dst += dststride
+    lea            src2q, [src2q+2*src2strideq]  ; src2 += src2stride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+INIT_XMM sse4
 HEVC_PUT_HEVC_EPEL 2,  8
 HEVC_PUT_HEVC_EPEL 4,  8
 HEVC_PUT_HEVC_EPEL 6,  8
@@ -1262,35 +1299,3 @@ HEVC_PUT_HEVC_EPEL 2, 10
 HEVC_PUT_HEVC_EPEL 4, 10
 HEVC_PUT_HEVC_EPEL 6, 10
 HEVC_PUT_HEVC_EPEL 8, 10
-
-
-HEVC_PUT_HEVC_EPEL_HV 2,  8
-HEVC_PUT_HEVC_EPEL_HV 4,  8
-HEVC_PUT_HEVC_EPEL_HV 6,  8
-HEVC_PUT_HEVC_EPEL_HV 8,  8
-
-HEVC_PUT_HEVC_EPEL_HV 2, 10
-HEVC_PUT_HEVC_EPEL_HV 4, 10
-HEVC_PUT_HEVC_EPEL_HV 6, 10
-HEVC_PUT_HEVC_EPEL_HV 8, 10
-
-
-HEVC_PUT_HEVC_QPEL 4,  8
-HEVC_PUT_HEVC_QPEL 8,  8
-HEVC_PUT_HEVC_QPEL 12, 8
-HEVC_PUT_HEVC_QPEL 16, 8
-
-HEVC_PUT_HEVC_QPEL 4, 10
-HEVC_PUT_HEVC_QPEL 8, 10
-
-HEVC_PUT_HEVC_QPEL_HV 2, 8
-HEVC_PUT_HEVC_QPEL_HV 4, 8
-HEVC_PUT_HEVC_QPEL_HV 6, 8
-HEVC_PUT_HEVC_QPEL_HV 8, 8
-
-HEVC_PUT_HEVC_QPEL_HV 2, 10
-HEVC_PUT_HEVC_QPEL_HV 4, 10
-HEVC_PUT_HEVC_QPEL_HV 6, 10
-HEVC_PUT_HEVC_QPEL_HV 8, 10
-
-%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index c5a64c7..aca3754 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -33,6 +33,7 @@ dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
 dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+if(ARCH_X86_64) \
 dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
 
 
@@ -100,6 +101,7 @@ EPEL_PROTOTYPES(epel_h , 10, sse4);
 EPEL_PROTOTYPES(epel_v ,  8, sse4);
 EPEL_PROTOTYPES(epel_v , 10, sse4);
 
+#if ARCH_X86_64
 EPEL_PROTOTYPES(epel_hv ,  8, sse4);
 EPEL_PROTOTYPES(epel_hv , 10, sse4);
 
@@ -114,6 +116,7 @@ QPEL_PROTOTYPES(qpel_v, 10, sse4);
 
 QPEL_PROTOTYPES(qpel_hv,  8, sse4);
 QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+#endif
 
 
 WEIGHTING_PROTOTYPES(8, sse4);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 30902be..44855d1 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -103,7 +103,7 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dst
     mc_rep_bi_func(name, bitd, step, W, opt)
 
 
-#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#if HAVE_SSE4_EXTERNAL
 
 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
@@ -137,6 +137,7 @@ mc_rep_funcs(epel_v,10,  8, 32, sse4);
 mc_rep_funcs(epel_v,10,  8, 24, sse4);
 mc_rep_funcs(epel_v,10,  8, 16, sse4);
 mc_rep_funcs(epel_v,10,  4, 12, sse4);
+# if ARCH_X86_64
 mc_rep_funcs(epel_hv, 8,  8, 64, sse4);
 mc_rep_funcs(epel_hv, 8,  8, 48, sse4);
 mc_rep_funcs(epel_hv, 8,  8, 32, sse4);
@@ -182,6 +183,7 @@ mc_rep_funcs(qpel_hv,10,  8, 32, sse4);
 mc_rep_funcs(qpel_hv,10,  8, 24, sse4);
 mc_rep_funcs(qpel_hv,10,  8, 16, sse4);
 mc_rep_funcs(qpel_hv,10,  4, 12, sse4);
+# endif
 
 #define mc_rep_uni_w(bitd, step, W, opt) \
 void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\
@@ -212,6 +214,7 @@ mc_rep_uni_w(10, 8, 32, sse4);
 mc_rep_uni_w(10, 8, 48, sse4);
 mc_rep_uni_w(10, 8, 64, sse4);
 
+# if ARCH_X86_64
 #define mc_rep_bi_w(bitd, step, W, opt) \
 void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
                                               int16_t *_src2, ptrdiff_t _src2stride, int height,                        \
@@ -243,6 +246,7 @@ mc_rep_bi_w(10, 8, 24, sse4);
 mc_rep_bi_w(10, 8, 32, sse4);
 mc_rep_bi_w(10, 8, 48, sse4);
 mc_rep_bi_w(10, 8, 64, sse4);
+# endif
 
 #define mc_uni_w_func(name, bitd, W, opt) \
 void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
@@ -272,11 +276,13 @@ mc_uni_w_funcs(epel_h, 8, sse4);
 mc_uni_w_func(epel_h, 8, 6, sse4);
 mc_uni_w_funcs(epel_v, 8, sse4);
 mc_uni_w_func(epel_v, 8, 6, sse4);
+# if ARCH_X86_64
 mc_uni_w_funcs(epel_hv, 8, sse4);
 mc_uni_w_func(epel_hv, 8, 6, sse4);
 mc_uni_w_funcs(qpel_h, 8, sse4);
 mc_uni_w_funcs(qpel_v, 8, sse4);
 mc_uni_w_funcs(qpel_hv, 8, sse4);
+# endif
 
 mc_uni_w_funcs(pel_pixels, 10, sse4);
 mc_uni_w_func(pel_pixels, 10, 6, sse4);
@@ -284,6 +290,7 @@ mc_uni_w_funcs(epel_h, 10, sse4);
 mc_uni_w_func(epel_h, 10, 6, sse4);
 mc_uni_w_funcs(epel_v, 10, sse4);
 mc_uni_w_func(epel_v, 10, 6, sse4);
+# if ARCH_X86_64
 mc_uni_w_funcs(epel_hv, 10, sse4);
 mc_uni_w_func(epel_hv, 10, 6, sse4);
 mc_uni_w_funcs(qpel_h, 10, sse4);
@@ -338,11 +345,12 @@ mc_bi_w_func(epel_hv, 10, 6, sse4);
 mc_bi_w_funcs(qpel_h, 10, sse4);
 mc_bi_w_funcs(qpel_v, 10, sse4);
 mc_bi_w_funcs(qpel_hv, 10, sse4);
+# endif
 
-#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#endif //HAVE_SSE4_EXTERNAL
 
 
-#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )          \
         PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
         PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
         PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
@@ -353,6 +361,7 @@ mc_bi_w_funcs(qpel_hv, 10, sse4);
         PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
         PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
 #define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)           \
+        if(ARCH_X86_64) {                                       \
         PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
         PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
         PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
@@ -360,7 +369,8 @@ mc_bi_w_funcs(qpel_hv, 10, sse4);
         PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
         PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
         PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
-        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt ); \
+        }
 
 
 void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
@@ -376,17 +386,19 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
                     c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
         }
-        if (EXTERNAL_SSE4(mm_flags) && ARCH_X86_64) {
+        if (EXTERNAL_SSE4(mm_flags)) {
 
             EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
             EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
             EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
+#if ARCH_X86_64
             EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 
             QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+#endif
 
         }
     } else if (bit_depth == 10) {
@@ -398,17 +410,19 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                     c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
                     c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
-        if (EXTERNAL_SSE4(mm_flags) && ARCH_X86_64) {
+        if (EXTERNAL_SSE4(mm_flags)) {
 
             EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
             EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
             EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
+#if ARCH_X86_64
             EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
 
             QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+#endif
         }
     }
 }
-- 
1.8.0.msysgit.0



More information about the ffmpeg-devel mailing list