[FFmpeg-devel] [PATCH 4/5] x86: hevc_mc: convert to ssse3

Christophe Gisquet christophe.gisquet at gmail.com
Sun Aug 24 10:46:33 CEST 2014


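The only SSE4 instructions used are pextrw with a memory destination,
which appears only in rather minor functions for small blocks, and
packusdw in the weighting functions. For the former, use whichever GPR
is available to extract the output word; for the latter, emulate it
with a psubd/packssdw/paddw sequence.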

Before (sse4), for block_w == 6:
4627 decicycles in epel_uni, 16377 runs, 7 skips
7422 decicycles in epel_bi, 65501 runs, 35 skips

After:
4649 decicycles in epel_uni, 16371 runs, 13 skips
7432 decicycles in epel_bi, 65505 runs, 31 skips
---
 libavcodec/x86/hevc_mc.asm    |  80 +++++--
 libavcodec/x86/hevcdsp.h      |  48 ++--
 libavcodec/x86/hevcdsp_init.c | 522 +++++++++++++++++++++---------------------
 3 files changed, 338 insertions(+), 312 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 9ce6bd1..52cc66e 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -31,6 +31,8 @@ max_pixels_10:          times 8  dw ((1 << 10)-1)
 max_pixels_12:          times 8  dw ((1 << 12)-1)
 zero:                   times 4  dd 0
 one_per_32:             times 4  dd 1
+pd_8000:                times 4 dd 0x8000
+pw_8000:                times 8 dw 0x8000
 
 SECTION .text
 %macro EPEL_TABLE 4
@@ -52,9 +54,9 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
 
 
 
-EPEL_TABLE  8, 8, b, sse4
-EPEL_TABLE 10, 4, w, sse4
-EPEL_TABLE 12, 4, w, sse4
+EPEL_TABLE  8, 8, b, ssse3
+EPEL_TABLE 10, 4, w, ssse3
+EPEL_TABLE 12, 4, w, ssse3
 
 %macro QPEL_TABLE 4
 hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
@@ -71,13 +73,13 @@ hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                         times %2 d%3   4, -1
 %endmacro
 
-QPEL_TABLE  8, 8, b, sse4
-QPEL_TABLE 10, 4, w, sse4
-QPEL_TABLE 12, 4, w, sse4
+QPEL_TABLE  8, 8, b, ssse3
+QPEL_TABLE 10, 4, w, ssse3
+QPEL_TABLE 12, 4, w, ssse3
 
 %define MAX_PB_SIZE  64
 
-%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
+%define hevc_qpel_filters_ssse3_14 hevc_qpel_filters_ssse3_10
 
 %if ARCH_X86_64
 
@@ -121,9 +123,9 @@ QPEL_TABLE 12, 4, w, sse4
 
 %macro EPEL_FILTER 2-4                            ; bit depth, filter index
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+    lea         rfilterq, [hevc_epel_filters_ssse3_%1]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq hevc_epel_filters_ssse3_%1
 %endif
     sub              %2q, 1
     shl              %2q, 5                      ; multiply by 32
@@ -138,9 +140,9 @@ QPEL_TABLE 12, 4, w, sse4
 
 %macro EPEL_HV_FILTER 1
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+    lea         rfilterq, [hevc_epel_filters_ssse3_%1]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq hevc_epel_filters_ssse3_%1
 %endif
     sub              mxq, 1
     sub              myq, 1
@@ -151,9 +153,9 @@ QPEL_TABLE 12, 4, w, sse4
     lea           r3srcq, [srcstrideq*3]
 
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_10]
+    lea         rfilterq, [hevc_epel_filters_ssse3_10]
 %else
-    %define rfilterq hevc_epel_filters_sse4_10
+    %define rfilterq hevc_epel_filters_ssse3_10
 %endif
     movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
     movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
@@ -161,9 +163,9 @@ QPEL_TABLE 12, 4, w, sse4
 
 %macro QPEL_FILTER 2
 %ifdef PIC
-    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
+    lea         rfilterq, [hevc_qpel_filters_ssse3_%1]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%1
+    %define rfilterq hevc_qpel_filters_ssse3_%1
 %endif
     lea              %2q, [%2q*8-8]
     movdqa           m12, [rfilterq + %2q*8]       ; get 4 first values of filters
@@ -357,14 +359,25 @@ QPEL_TABLE 12, 4, w, sse4
 %endmacro
 
 %macro PEL_8STORE2 3
+%if cpuflag(sse4)                                ; pextrw with a memory destination is only used when sse4 is enabled
     pextrw          [%1], %2, 0
+%else                                            ; fallback: bounce the word through a scratch GPR
+    movd        rfilterd, %2                     ; low dword of %2 -> rfilterd (the function using this macro must provide rfilter)
+    mov             [%1], rfilterw               ; store only the low 16 bits, i.e. word 0 of %2
+%endif
 %endmacro
 %macro PEL_8STORE4 3
     movd            [%1], %2
 %endmacro
 %macro PEL_8STORE6 3
     movd            [%1], %2
+%if cpuflag(sse4)                                ; pextrw with a memory destination is only used when sse4 is enabled
     pextrw        [%1+4], %2, 2
+%else                                            ; fallback: bounce the word through a scratch GPR
+    psrldq            %2, 4                      ; shift %2 right by 4 bytes so former word 2 becomes word 0 (modifies %2)
+    movd        rfilterd, %2                     ; low dword of the shifted %2 -> rfilterd (the function using this macro must provide rfilter)
+    mov           [%1+4], rfilterw               ; store only the low 16 bits at offset 4
+%endif
 %endmacro
 %macro PEL_8STORE8 3
     movq           [%1], %2
@@ -426,7 +439,7 @@ QPEL_TABLE 12, 4, w, sse4
 %endmacro
 
 %macro QPEL_SET_POINTER  2
-    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
+    lea         rfilterq, [hevc_qpel_filters_ssse3_%1]
     lea              %2q, [rfilterq + 8*%2q]
 %endmacro
 
@@ -535,7 +548,6 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
 ; ******************************
 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
 ;                         uint8_t *_src, ptrdiff_t _srcstride,
@@ -543,7 +555,7 @@ INIT_XMM sse4                                    ; adds ff_ and _sse4 to functio
 ; ******************************
 
 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 5, 3, dst, src, srcstride, height, rfilter
     pxor               m2, m2
 .loop
     SIMPLE_LOAD       %1, %2, srcq, m0
@@ -552,7 +564,7 @@ cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
     LOOP_END         dst, src, srcstride
     RET
 
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 6, 2, dst, dststride, src, srcstride, height, rfilter
 .loop
     SIMPLE_LOAD       %1, %2, srcq, m0
     PEL_%2STORE%1   dstq, m0, m1
@@ -562,7 +574,7 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri
     jnz               .loop                      ; height loop
     RET
 
-cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 7, 6, dst, dststride, src, srcstride, src2, height, rfilter
     pxor              m2, m2
     movdqa            m5, [pw_bi_%2]
 .loop
@@ -1154,14 +1166,29 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
     RET
 %endmacro
 
+%macro PACKUSDW 2              ; dst, src: pack signed dwords of %1 and %2 into unsigned-saturated words in %1
+%if cpuflag(sse4)              ; native instruction available
+    packusdw %1, %2
+%else                          ; emulation; unlike the native path, this also modifies %2
+    psubd    %1, [pd_8000]     ; bias each dword by -0x8000 so 0..0xffff maps onto the signed word range
+    psubd    %2, [pd_8000]     ; same bias on the second operand (this is what changes %2)
+    packssdw %1, %2            ; signed saturating pack clamps at the biased bounds
+    paddw    %1, [pw_8000]     ; add 0x8000 back to each word (wrapping) to undo the bias
+%endif
+%endmacro
+
 %macro WEIGHTING_FUNCS 2
 %if WIN64 || ARCH_X86_32
-cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
     mov             r4d, denomm
 %define SHIFT  r4d
+%define rfilterd  r5d
+%define rfilterw  r5w
 %else
-cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, denom, wx, ox
 %define SHIFT  denomd
+%define rfilterd  r6d
+%define rfilterw  r6w
 %endif
     lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
 %if %1 <= 4
@@ -1208,7 +1235,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh
     paddd             m0, m3
     paddd             m1, m3
 %endif
-    packusdw          m0, m1
+    PACKUSDW          m0, m1
 %if %2 == 8
     packuswb          m0, m0
 %else
@@ -1221,7 +1248,9 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh
     jnz               .loop                      ; height loop
     RET
 
-cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
+cglobal hevc_put_hevc_bi_w%1_%2, 5, 8, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
+%define rfilterd r7d
+%define rfilterw r7w
     mov              r6d, denomm
 %if %1 <= 4
     pxor              m1, m1
@@ -1279,7 +1308,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
     psrad             m0, m5
     psrad             m1, m5
 %endif
-    packusdw          m0, m1
+    PACKUSDW          m0, m1
 %if %2 == 8
     packuswb          m0, m0
 %else
@@ -1294,6 +1323,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
     RET
 %endmacro
 
+INIT_XMM ssse3
 WEIGHTING_FUNCS 2, 8
 WEIGHTING_FUNCS 4, 8
 WEIGHTING_FUNCS 6, 8
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index df49269..a652cbf 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -91,43 +91,43 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
 ///////////////////////////////////////////////////////////////////////////////
 // QPEL_PIXELS EPEL_PIXELS
 ///////////////////////////////////////////////////////////////////////////////
-EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
-EPEL_PROTOTYPES(pel_pixels , 10, sse4);
-EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+EPEL_PROTOTYPES(pel_pixels ,  8, ssse3);
+EPEL_PROTOTYPES(pel_pixels , 10, ssse3);
+EPEL_PROTOTYPES(pel_pixels , 12, ssse3);
 ///////////////////////////////////////////////////////////////////////////////
 // EPEL
 ///////////////////////////////////////////////////////////////////////////////
-EPEL_PROTOTYPES(epel_h ,  8, sse4);
-EPEL_PROTOTYPES(epel_h , 10, sse4);
-EPEL_PROTOTYPES(epel_h , 12, sse4);
+EPEL_PROTOTYPES(epel_h ,  8, ssse3);
+EPEL_PROTOTYPES(epel_h , 10, ssse3);
+EPEL_PROTOTYPES(epel_h , 12, ssse3);
 
-EPEL_PROTOTYPES(epel_v ,  8, sse4);
-EPEL_PROTOTYPES(epel_v , 10, sse4);
-EPEL_PROTOTYPES(epel_v , 12, sse4);
+EPEL_PROTOTYPES(epel_v ,  8, ssse3);
+EPEL_PROTOTYPES(epel_v , 10, ssse3);
+EPEL_PROTOTYPES(epel_v , 12, ssse3);
 
-EPEL_PROTOTYPES(epel_hv ,  8, sse4);
-EPEL_PROTOTYPES(epel_hv , 10, sse4);
-EPEL_PROTOTYPES(epel_hv , 12, sse4);
+EPEL_PROTOTYPES(epel_hv ,  8, ssse3);
+EPEL_PROTOTYPES(epel_hv , 10, ssse3);
+EPEL_PROTOTYPES(epel_hv , 12, ssse3);
 
 ///////////////////////////////////////////////////////////////////////////////
 // QPEL
 ///////////////////////////////////////////////////////////////////////////////
-QPEL_PROTOTYPES(qpel_h ,  8, sse4);
-QPEL_PROTOTYPES(qpel_h , 10, sse4);
-QPEL_PROTOTYPES(qpel_h , 12, sse4);
+QPEL_PROTOTYPES(qpel_h ,  8, ssse3);
+QPEL_PROTOTYPES(qpel_h , 10, ssse3);
+QPEL_PROTOTYPES(qpel_h , 12, ssse3);
 
-QPEL_PROTOTYPES(qpel_v,  8, sse4);
-QPEL_PROTOTYPES(qpel_v, 10, sse4);
-QPEL_PROTOTYPES(qpel_v, 12, sse4);
+QPEL_PROTOTYPES(qpel_v,  8, ssse3);
+QPEL_PROTOTYPES(qpel_v, 10, ssse3);
+QPEL_PROTOTYPES(qpel_v, 12, ssse3);
 
-QPEL_PROTOTYPES(qpel_hv,  8, sse4);
-QPEL_PROTOTYPES(qpel_hv, 10, sse4);
-QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+QPEL_PROTOTYPES(qpel_hv,  8, ssse3);
+QPEL_PROTOTYPES(qpel_hv, 10, ssse3);
+QPEL_PROTOTYPES(qpel_hv, 12, ssse3);
 
 
-WEIGHTING_PROTOTYPES(8, sse4);
-WEIGHTING_PROTOTYPES(10, sse4);
-WEIGHTING_PROTOTYPES(12, sse4);
+WEIGHTING_PROTOTYPES(8, ssse3);
+WEIGHTING_PROTOTYPES(10, ssse3);
+WEIGHTING_PROTOTYPES(12, ssse3);
 
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index acb82c4..95f69e0 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -163,126 +163,126 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
     mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
     mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
 
-#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
-
-mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
-mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
-mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
-mc_rep_funcs(pel_pixels, 8,  8, 24, sse4);
-mc_rep_funcs(pel_pixels,10,  8, 64, sse4);
-mc_rep_funcs(pel_pixels,10,  8, 48, sse4);
-mc_rep_funcs(pel_pixels,10,  8, 32, sse4);
-mc_rep_funcs(pel_pixels,10,  8, 24, sse4);
-mc_rep_funcs(pel_pixels,10,  8, 16, sse4);
-mc_rep_funcs(pel_pixels,10,  4, 12, sse4);
-mc_rep_funcs(pel_pixels,12,  8, 64, sse4);
-mc_rep_funcs(pel_pixels,12,  8, 48, sse4);
-mc_rep_funcs(pel_pixels,12,  8, 32, sse4);
-mc_rep_funcs(pel_pixels,12,  8, 24, sse4);
-mc_rep_funcs(pel_pixels,12,  8, 16, sse4);
-mc_rep_funcs(pel_pixels,12,  4, 12, sse4);
-
-mc_rep_funcs(epel_h, 8, 16, 64, sse4);
-mc_rep_funcs(epel_h, 8, 16, 48, sse4);
-mc_rep_funcs(epel_h, 8, 16, 32, sse4);
-mc_rep_funcs(epel_h, 8,  8, 24, sse4);
-mc_rep_funcs(epel_h,10,  8, 64, sse4);
-mc_rep_funcs(epel_h,10,  8, 48, sse4);
-mc_rep_funcs(epel_h,10,  8, 32, sse4);
-mc_rep_funcs(epel_h,10,  8, 24, sse4);
-mc_rep_funcs(epel_h,10,  8, 16, sse4);
-mc_rep_funcs(epel_h,10,  4, 12, sse4);
-mc_rep_funcs(epel_h,12,  8, 64, sse4);
-mc_rep_funcs(epel_h,12,  8, 48, sse4);
-mc_rep_funcs(epel_h,12,  8, 32, sse4);
-mc_rep_funcs(epel_h,12,  8, 24, sse4);
-mc_rep_funcs(epel_h,12,  8, 16, sse4);
-mc_rep_funcs(epel_h,12,  4, 12, sse4);
-mc_rep_funcs(epel_v, 8, 16, 64, sse4);
-mc_rep_funcs(epel_v, 8, 16, 48, sse4);
-mc_rep_funcs(epel_v, 8, 16, 32, sse4);
-mc_rep_funcs(epel_v, 8,  8, 24, sse4);
-mc_rep_funcs(epel_v,10,  8, 64, sse4);
-mc_rep_funcs(epel_v,10,  8, 48, sse4);
-mc_rep_funcs(epel_v,10,  8, 32, sse4);
-mc_rep_funcs(epel_v,10,  8, 24, sse4);
-mc_rep_funcs(epel_v,10,  8, 16, sse4);
-mc_rep_funcs(epel_v,10,  4, 12, sse4);
-mc_rep_funcs(epel_v,12,  8, 64, sse4);
-mc_rep_funcs(epel_v,12,  8, 48, sse4);
-mc_rep_funcs(epel_v,12,  8, 32, sse4);
-mc_rep_funcs(epel_v,12,  8, 24, sse4);
-mc_rep_funcs(epel_v,12,  8, 16, sse4);
-mc_rep_funcs(epel_v,12,  4, 12, sse4);
-mc_rep_funcs(epel_hv, 8,  8, 64, sse4);
-mc_rep_funcs(epel_hv, 8,  8, 48, sse4);
-mc_rep_funcs(epel_hv, 8,  8, 32, sse4);
-mc_rep_funcs(epel_hv, 8,  8, 24, sse4);
-mc_rep_funcs(epel_hv, 8,  8, 16, sse4);
-mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4);
-mc_rep_funcs(epel_hv,10,  8, 64, sse4);
-mc_rep_funcs(epel_hv,10,  8, 48, sse4);
-mc_rep_funcs(epel_hv,10,  8, 32, sse4);
-mc_rep_funcs(epel_hv,10,  8, 24, sse4);
-mc_rep_funcs(epel_hv,10,  8, 16, sse4);
-mc_rep_funcs(epel_hv,10,  4, 12, sse4);
-mc_rep_funcs(epel_hv,12,  8, 64, sse4);
-mc_rep_funcs(epel_hv,12,  8, 48, sse4);
-mc_rep_funcs(epel_hv,12,  8, 32, sse4);
-mc_rep_funcs(epel_hv,12,  8, 24, sse4);
-mc_rep_funcs(epel_hv,12,  8, 16, sse4);
-mc_rep_funcs(epel_hv,12,  4, 12, sse4);
-
-mc_rep_funcs(qpel_h, 8, 16, 64, sse4);
-mc_rep_funcs(qpel_h, 8, 16, 48, sse4);
-mc_rep_funcs(qpel_h, 8, 16, 32, sse4);
-mc_rep_funcs(qpel_h, 8,  8, 24, sse4);
-mc_rep_funcs(qpel_h,10,  8, 64, sse4);
-mc_rep_funcs(qpel_h,10,  8, 48, sse4);
-mc_rep_funcs(qpel_h,10,  8, 32, sse4);
-mc_rep_funcs(qpel_h,10,  8, 24, sse4);
-mc_rep_funcs(qpel_h,10,  8, 16, sse4);
-mc_rep_funcs(qpel_h,10,  4, 12, sse4);
-mc_rep_funcs(qpel_h,12,  8, 64, sse4);
-mc_rep_funcs(qpel_h,12,  8, 48, sse4);
-mc_rep_funcs(qpel_h,12,  8, 32, sse4);
-mc_rep_funcs(qpel_h,12,  8, 24, sse4);
-mc_rep_funcs(qpel_h,12,  8, 16, sse4);
-mc_rep_funcs(qpel_h,12,  4, 12, sse4);
-mc_rep_funcs(qpel_v, 8, 16, 64, sse4);
-mc_rep_funcs(qpel_v, 8, 16, 48, sse4);
-mc_rep_funcs(qpel_v, 8, 16, 32, sse4);
-mc_rep_funcs(qpel_v, 8,  8, 24, sse4);
-mc_rep_funcs(qpel_v,10,  8, 64, sse4);
-mc_rep_funcs(qpel_v,10,  8, 48, sse4);
-mc_rep_funcs(qpel_v,10,  8, 32, sse4);
-mc_rep_funcs(qpel_v,10,  8, 24, sse4);
-mc_rep_funcs(qpel_v,10,  8, 16, sse4);
-mc_rep_funcs(qpel_v,10,  4, 12, sse4);
-mc_rep_funcs(qpel_v,12,  8, 64, sse4);
-mc_rep_funcs(qpel_v,12,  8, 48, sse4);
-mc_rep_funcs(qpel_v,12,  8, 32, sse4);
-mc_rep_funcs(qpel_v,12,  8, 24, sse4);
-mc_rep_funcs(qpel_v,12,  8, 16, sse4);
-mc_rep_funcs(qpel_v,12,  4, 12, sse4);
-mc_rep_funcs(qpel_hv, 8,  8, 64, sse4);
-mc_rep_funcs(qpel_hv, 8,  8, 48, sse4);
-mc_rep_funcs(qpel_hv, 8,  8, 32, sse4);
-mc_rep_funcs(qpel_hv, 8,  8, 24, sse4);
-mc_rep_funcs(qpel_hv, 8,  8, 16, sse4);
-mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4);
-mc_rep_funcs(qpel_hv,10,  8, 64, sse4);
-mc_rep_funcs(qpel_hv,10,  8, 48, sse4);
-mc_rep_funcs(qpel_hv,10,  8, 32, sse4);
-mc_rep_funcs(qpel_hv,10,  8, 24, sse4);
-mc_rep_funcs(qpel_hv,10,  8, 16, sse4);
-mc_rep_funcs(qpel_hv,10,  4, 12, sse4);
-mc_rep_funcs(qpel_hv,12,  8, 64, sse4);
-mc_rep_funcs(qpel_hv,12,  8, 48, sse4);
-mc_rep_funcs(qpel_hv,12,  8, 32, sse4);
-mc_rep_funcs(qpel_hv,12,  8, 24, sse4);
-mc_rep_funcs(qpel_hv,12,  8, 16, sse4);
-mc_rep_funcs(qpel_hv,12,  4, 12, sse4);
+#if ARCH_X86_64 && HAVE_SSSE3_EXTERNAL
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, ssse3);
+mc_rep_funcs(pel_pixels, 8, 16, 48, ssse3);
+mc_rep_funcs(pel_pixels, 8, 16, 32, ssse3);
+mc_rep_funcs(pel_pixels, 8,  8, 24, ssse3);
+mc_rep_funcs(pel_pixels,10,  8, 64, ssse3);
+mc_rep_funcs(pel_pixels,10,  8, 48, ssse3);
+mc_rep_funcs(pel_pixels,10,  8, 32, ssse3);
+mc_rep_funcs(pel_pixels,10,  8, 24, ssse3);
+mc_rep_funcs(pel_pixels,10,  8, 16, ssse3);
+mc_rep_funcs(pel_pixels,10,  4, 12, ssse3);
+mc_rep_funcs(pel_pixels,12,  8, 64, ssse3);
+mc_rep_funcs(pel_pixels,12,  8, 48, ssse3);
+mc_rep_funcs(pel_pixels,12,  8, 32, ssse3);
+mc_rep_funcs(pel_pixels,12,  8, 24, ssse3);
+mc_rep_funcs(pel_pixels,12,  8, 16, ssse3);
+mc_rep_funcs(pel_pixels,12,  4, 12, ssse3);
+
+mc_rep_funcs(epel_h, 8, 16, 64, ssse3);
+mc_rep_funcs(epel_h, 8, 16, 48, ssse3);
+mc_rep_funcs(epel_h, 8, 16, 32, ssse3);
+mc_rep_funcs(epel_h, 8,  8, 24, ssse3);
+mc_rep_funcs(epel_h,10,  8, 64, ssse3);
+mc_rep_funcs(epel_h,10,  8, 48, ssse3);
+mc_rep_funcs(epel_h,10,  8, 32, ssse3);
+mc_rep_funcs(epel_h,10,  8, 24, ssse3);
+mc_rep_funcs(epel_h,10,  8, 16, ssse3);
+mc_rep_funcs(epel_h,10,  4, 12, ssse3);
+mc_rep_funcs(epel_h,12,  8, 64, ssse3);
+mc_rep_funcs(epel_h,12,  8, 48, ssse3);
+mc_rep_funcs(epel_h,12,  8, 32, ssse3);
+mc_rep_funcs(epel_h,12,  8, 24, ssse3);
+mc_rep_funcs(epel_h,12,  8, 16, ssse3);
+mc_rep_funcs(epel_h,12,  4, 12, ssse3);
+mc_rep_funcs(epel_v, 8, 16, 64, ssse3);
+mc_rep_funcs(epel_v, 8, 16, 48, ssse3);
+mc_rep_funcs(epel_v, 8, 16, 32, ssse3);
+mc_rep_funcs(epel_v, 8,  8, 24, ssse3);
+mc_rep_funcs(epel_v,10,  8, 64, ssse3);
+mc_rep_funcs(epel_v,10,  8, 48, ssse3);
+mc_rep_funcs(epel_v,10,  8, 32, ssse3);
+mc_rep_funcs(epel_v,10,  8, 24, ssse3);
+mc_rep_funcs(epel_v,10,  8, 16, ssse3);
+mc_rep_funcs(epel_v,10,  4, 12, ssse3);
+mc_rep_funcs(epel_v,12,  8, 64, ssse3);
+mc_rep_funcs(epel_v,12,  8, 48, ssse3);
+mc_rep_funcs(epel_v,12,  8, 32, ssse3);
+mc_rep_funcs(epel_v,12,  8, 24, ssse3);
+mc_rep_funcs(epel_v,12,  8, 16, ssse3);
+mc_rep_funcs(epel_v,12,  4, 12, ssse3);
+mc_rep_funcs(epel_hv, 8,  8, 64, ssse3);
+mc_rep_funcs(epel_hv, 8,  8, 48, ssse3);
+mc_rep_funcs(epel_hv, 8,  8, 32, ssse3);
+mc_rep_funcs(epel_hv, 8,  8, 24, ssse3);
+mc_rep_funcs(epel_hv, 8,  8, 16, ssse3);
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, ssse3);
+mc_rep_funcs(epel_hv,10,  8, 64, ssse3);
+mc_rep_funcs(epel_hv,10,  8, 48, ssse3);
+mc_rep_funcs(epel_hv,10,  8, 32, ssse3);
+mc_rep_funcs(epel_hv,10,  8, 24, ssse3);
+mc_rep_funcs(epel_hv,10,  8, 16, ssse3);
+mc_rep_funcs(epel_hv,10,  4, 12, ssse3);
+mc_rep_funcs(epel_hv,12,  8, 64, ssse3);
+mc_rep_funcs(epel_hv,12,  8, 48, ssse3);
+mc_rep_funcs(epel_hv,12,  8, 32, ssse3);
+mc_rep_funcs(epel_hv,12,  8, 24, ssse3);
+mc_rep_funcs(epel_hv,12,  8, 16, ssse3);
+mc_rep_funcs(epel_hv,12,  4, 12, ssse3);
+
+mc_rep_funcs(qpel_h, 8, 16, 64, ssse3);
+mc_rep_funcs(qpel_h, 8, 16, 48, ssse3);
+mc_rep_funcs(qpel_h, 8, 16, 32, ssse3);
+mc_rep_funcs(qpel_h, 8,  8, 24, ssse3);
+mc_rep_funcs(qpel_h,10,  8, 64, ssse3);
+mc_rep_funcs(qpel_h,10,  8, 48, ssse3);
+mc_rep_funcs(qpel_h,10,  8, 32, ssse3);
+mc_rep_funcs(qpel_h,10,  8, 24, ssse3);
+mc_rep_funcs(qpel_h,10,  8, 16, ssse3);
+mc_rep_funcs(qpel_h,10,  4, 12, ssse3);
+mc_rep_funcs(qpel_h,12,  8, 64, ssse3);
+mc_rep_funcs(qpel_h,12,  8, 48, ssse3);
+mc_rep_funcs(qpel_h,12,  8, 32, ssse3);
+mc_rep_funcs(qpel_h,12,  8, 24, ssse3);
+mc_rep_funcs(qpel_h,12,  8, 16, ssse3);
+mc_rep_funcs(qpel_h,12,  4, 12, ssse3);
+mc_rep_funcs(qpel_v, 8, 16, 64, ssse3);
+mc_rep_funcs(qpel_v, 8, 16, 48, ssse3);
+mc_rep_funcs(qpel_v, 8, 16, 32, ssse3);
+mc_rep_funcs(qpel_v, 8,  8, 24, ssse3);
+mc_rep_funcs(qpel_v,10,  8, 64, ssse3);
+mc_rep_funcs(qpel_v,10,  8, 48, ssse3);
+mc_rep_funcs(qpel_v,10,  8, 32, ssse3);
+mc_rep_funcs(qpel_v,10,  8, 24, ssse3);
+mc_rep_funcs(qpel_v,10,  8, 16, ssse3);
+mc_rep_funcs(qpel_v,10,  4, 12, ssse3);
+mc_rep_funcs(qpel_v,12,  8, 64, ssse3);
+mc_rep_funcs(qpel_v,12,  8, 48, ssse3);
+mc_rep_funcs(qpel_v,12,  8, 32, ssse3);
+mc_rep_funcs(qpel_v,12,  8, 24, ssse3);
+mc_rep_funcs(qpel_v,12,  8, 16, ssse3);
+mc_rep_funcs(qpel_v,12,  4, 12, ssse3);
+mc_rep_funcs(qpel_hv, 8,  8, 64, ssse3);
+mc_rep_funcs(qpel_hv, 8,  8, 48, ssse3);
+mc_rep_funcs(qpel_hv, 8,  8, 32, ssse3);
+mc_rep_funcs(qpel_hv, 8,  8, 24, ssse3);
+mc_rep_funcs(qpel_hv, 8,  8, 16, ssse3);
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, ssse3);
+mc_rep_funcs(qpel_hv,10,  8, 64, ssse3);
+mc_rep_funcs(qpel_hv,10,  8, 48, ssse3);
+mc_rep_funcs(qpel_hv,10,  8, 32, ssse3);
+mc_rep_funcs(qpel_hv,10,  8, 24, ssse3);
+mc_rep_funcs(qpel_hv,10,  8, 16, ssse3);
+mc_rep_funcs(qpel_hv,10,  4, 12, ssse3);
+mc_rep_funcs(qpel_hv,12,  8, 64, ssse3);
+mc_rep_funcs(qpel_hv,12,  8, 48, ssse3);
+mc_rep_funcs(qpel_hv,12,  8, 32, ssse3);
+mc_rep_funcs(qpel_hv,12,  8, 24, ssse3);
+mc_rep_funcs(qpel_hv,12,  8, 16, ssse3);
+mc_rep_funcs(qpel_hv,12,  4, 12, ssse3);
 
 #define mc_rep_uni_w(bitd, step, W, opt) \
 void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\
@@ -299,26 +299,26 @@ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststri
     }                                                                                                                   \
 }
 
-mc_rep_uni_w(8, 6, 12, sse4);
-mc_rep_uni_w(8, 8, 16, sse4);
-mc_rep_uni_w(8, 8, 24, sse4);
-mc_rep_uni_w(8, 8, 32, sse4);
-mc_rep_uni_w(8, 8, 48, sse4);
-mc_rep_uni_w(8, 8, 64, sse4);
-
-mc_rep_uni_w(10, 6, 12, sse4);
-mc_rep_uni_w(10, 8, 16, sse4);
-mc_rep_uni_w(10, 8, 24, sse4);
-mc_rep_uni_w(10, 8, 32, sse4);
-mc_rep_uni_w(10, 8, 48, sse4);
-mc_rep_uni_w(10, 8, 64, sse4);
-
-mc_rep_uni_w(12, 6, 12, sse4);
-mc_rep_uni_w(12, 8, 16, sse4);
-mc_rep_uni_w(12, 8, 24, sse4);
-mc_rep_uni_w(12, 8, 32, sse4);
-mc_rep_uni_w(12, 8, 48, sse4);
-mc_rep_uni_w(12, 8, 64, sse4);
+mc_rep_uni_w(8, 6, 12, ssse3);
+mc_rep_uni_w(8, 8, 16, ssse3);
+mc_rep_uni_w(8, 8, 24, ssse3);
+mc_rep_uni_w(8, 8, 32, ssse3);
+mc_rep_uni_w(8, 8, 48, ssse3);
+mc_rep_uni_w(8, 8, 64, ssse3);
+
+mc_rep_uni_w(10, 6, 12, ssse3);
+mc_rep_uni_w(10, 8, 16, ssse3);
+mc_rep_uni_w(10, 8, 24, ssse3);
+mc_rep_uni_w(10, 8, 32, ssse3);
+mc_rep_uni_w(10, 8, 48, ssse3);
+mc_rep_uni_w(10, 8, 64, ssse3);
+
+mc_rep_uni_w(12, 6, 12, ssse3);
+mc_rep_uni_w(12, 8, 16, ssse3);
+mc_rep_uni_w(12, 8, 24, ssse3);
+mc_rep_uni_w(12, 8, 32, ssse3);
+mc_rep_uni_w(12, 8, 48, ssse3);
+mc_rep_uni_w(12, 8, 64, ssse3);
 
 #define mc_rep_bi_w(bitd, step, W, opt) \
 void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
@@ -338,26 +338,26 @@ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststrid
     }                                                                                                                   \
 }
 
-mc_rep_bi_w(8, 6, 12, sse4);
-mc_rep_bi_w(8, 8, 16, sse4);
-mc_rep_bi_w(8, 8, 24, sse4);
-mc_rep_bi_w(8, 8, 32, sse4);
-mc_rep_bi_w(8, 8, 48, sse4);
-mc_rep_bi_w(8, 8, 64, sse4);
-
-mc_rep_bi_w(10, 6, 12, sse4);
-mc_rep_bi_w(10, 8, 16, sse4);
-mc_rep_bi_w(10, 8, 24, sse4);
-mc_rep_bi_w(10, 8, 32, sse4);
-mc_rep_bi_w(10, 8, 48, sse4);
-mc_rep_bi_w(10, 8, 64, sse4);
-
-mc_rep_bi_w(12, 6, 12, sse4);
-mc_rep_bi_w(12, 8, 16, sse4);
-mc_rep_bi_w(12, 8, 24, sse4);
-mc_rep_bi_w(12, 8, 32, sse4);
-mc_rep_bi_w(12, 8, 48, sse4);
-mc_rep_bi_w(12, 8, 64, sse4);
+mc_rep_bi_w(8, 6, 12, ssse3);
+mc_rep_bi_w(8, 8, 16, ssse3);
+mc_rep_bi_w(8, 8, 24, ssse3);
+mc_rep_bi_w(8, 8, 32, ssse3);
+mc_rep_bi_w(8, 8, 48, ssse3);
+mc_rep_bi_w(8, 8, 64, ssse3);
+
+mc_rep_bi_w(10, 6, 12, ssse3);
+mc_rep_bi_w(10, 8, 16, ssse3);
+mc_rep_bi_w(10, 8, 24, ssse3);
+mc_rep_bi_w(10, 8, 32, ssse3);
+mc_rep_bi_w(10, 8, 48, ssse3);
+mc_rep_bi_w(10, 8, 64, ssse3);
+
+mc_rep_bi_w(12, 6, 12, ssse3);
+mc_rep_bi_w(12, 8, 16, ssse3);
+mc_rep_bi_w(12, 8, 24, ssse3);
+mc_rep_bi_w(12, 8, 32, ssse3);
+mc_rep_bi_w(12, 8, 48, ssse3);
+mc_rep_bi_w(12, 8, 64, ssse3);
 
 #define mc_uni_w_func(name, bitd, W, opt) \
 static void put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
@@ -381,41 +381,41 @@ static void put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _
         mc_uni_w_func(name, bitd, 48, opt);   \
         mc_uni_w_func(name, bitd, 64, opt)
 
-mc_uni_w_funcs(pel_pixels, 8, sse4);
-mc_uni_w_func(pel_pixels, 8, 6, sse4);
-mc_uni_w_funcs(epel_h, 8, sse4);
-mc_uni_w_func(epel_h, 8, 6, sse4);
-mc_uni_w_funcs(epel_v, 8, sse4);
-mc_uni_w_func(epel_v, 8, 6, sse4);
-mc_uni_w_funcs(epel_hv, 8, sse4);
-mc_uni_w_func(epel_hv, 8, 6, sse4);
-mc_uni_w_funcs(qpel_h, 8, sse4);
-mc_uni_w_funcs(qpel_v, 8, sse4);
-mc_uni_w_funcs(qpel_hv, 8, sse4);
-
-mc_uni_w_funcs(pel_pixels, 10, sse4);
-mc_uni_w_func(pel_pixels, 10, 6, sse4);
-mc_uni_w_funcs(epel_h, 10, sse4);
-mc_uni_w_func(epel_h, 10, 6, sse4);
-mc_uni_w_funcs(epel_v, 10, sse4);
-mc_uni_w_func(epel_v, 10, 6, sse4);
-mc_uni_w_funcs(epel_hv, 10, sse4);
-mc_uni_w_func(epel_hv, 10, 6, sse4);
-mc_uni_w_funcs(qpel_h, 10, sse4);
-mc_uni_w_funcs(qpel_v, 10, sse4);
-mc_uni_w_funcs(qpel_hv, 10, sse4);
-
-mc_uni_w_funcs(pel_pixels, 12, sse4);
-mc_uni_w_func(pel_pixels, 12, 6, sse4);
-mc_uni_w_funcs(epel_h, 12, sse4);
-mc_uni_w_func(epel_h, 12, 6, sse4);
-mc_uni_w_funcs(epel_v, 12, sse4);
-mc_uni_w_func(epel_v, 12, 6, sse4);
-mc_uni_w_funcs(epel_hv, 12, sse4);
-mc_uni_w_func(epel_hv, 12, 6, sse4);
-mc_uni_w_funcs(qpel_h, 12, sse4);
-mc_uni_w_funcs(qpel_v, 12, sse4);
-mc_uni_w_funcs(qpel_hv, 12, sse4);
+mc_uni_w_funcs(pel_pixels, 8, ssse3);
+mc_uni_w_func(pel_pixels, 8, 6, ssse3);
+mc_uni_w_funcs(epel_h, 8, ssse3);
+mc_uni_w_func(epel_h, 8, 6, ssse3);
+mc_uni_w_funcs(epel_v, 8, ssse3);
+mc_uni_w_func(epel_v, 8, 6, ssse3);
+mc_uni_w_funcs(epel_hv, 8, ssse3);
+mc_uni_w_func(epel_hv, 8, 6, ssse3);
+mc_uni_w_funcs(qpel_h, 8, ssse3);
+mc_uni_w_funcs(qpel_v, 8, ssse3);
+mc_uni_w_funcs(qpel_hv, 8, ssse3);
+
+mc_uni_w_funcs(pel_pixels, 10, ssse3);
+mc_uni_w_func(pel_pixels, 10, 6, ssse3);
+mc_uni_w_funcs(epel_h, 10, ssse3);
+mc_uni_w_func(epel_h, 10, 6, ssse3);
+mc_uni_w_funcs(epel_v, 10, ssse3);
+mc_uni_w_func(epel_v, 10, 6, ssse3);
+mc_uni_w_funcs(epel_hv, 10, ssse3);
+mc_uni_w_func(epel_hv, 10, 6, ssse3);
+mc_uni_w_funcs(qpel_h, 10, ssse3);
+mc_uni_w_funcs(qpel_v, 10, ssse3);
+mc_uni_w_funcs(qpel_hv, 10, ssse3);
+
+mc_uni_w_funcs(pel_pixels, 12, ssse3);
+mc_uni_w_func(pel_pixels, 12, 6, ssse3);
+mc_uni_w_funcs(epel_h, 12, ssse3);
+mc_uni_w_func(epel_h, 12, 6, ssse3);
+mc_uni_w_funcs(epel_v, 12, ssse3);
+mc_uni_w_func(epel_v, 12, 6, ssse3);
+mc_uni_w_funcs(epel_hv, 12, ssse3);
+mc_uni_w_func(epel_hv, 12, 6, ssse3);
+mc_uni_w_funcs(qpel_h, 12, ssse3);
+mc_uni_w_funcs(qpel_v, 12, ssse3);
+mc_uni_w_funcs(qpel_hv, 12, ssse3);
 
 #define mc_bi_w_func(name, bitd, W, opt) \
 static void put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
@@ -441,42 +441,42 @@ static void put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _d
         mc_bi_w_func(name, bitd, 48, opt);   \
         mc_bi_w_func(name, bitd, 64, opt)
 
-mc_bi_w_funcs(pel_pixels, 8, sse4);
-mc_bi_w_func(pel_pixels, 8, 6, sse4);
-mc_bi_w_funcs(epel_h, 8, sse4);
-mc_bi_w_func(epel_h, 8, 6, sse4);
-mc_bi_w_funcs(epel_v, 8, sse4);
-mc_bi_w_func(epel_v, 8, 6, sse4);
-mc_bi_w_funcs(epel_hv, 8, sse4);
-mc_bi_w_func(epel_hv, 8, 6, sse4);
-mc_bi_w_funcs(qpel_h, 8, sse4);
-mc_bi_w_funcs(qpel_v, 8, sse4);
-mc_bi_w_funcs(qpel_hv, 8, sse4);
-
-mc_bi_w_funcs(pel_pixels, 10, sse4);
-mc_bi_w_func(pel_pixels, 10, 6, sse4);
-mc_bi_w_funcs(epel_h, 10, sse4);
-mc_bi_w_func(epel_h, 10, 6, sse4);
-mc_bi_w_funcs(epel_v, 10, sse4);
-mc_bi_w_func(epel_v, 10, 6, sse4);
-mc_bi_w_funcs(epel_hv, 10, sse4);
-mc_bi_w_func(epel_hv, 10, 6, sse4);
-mc_bi_w_funcs(qpel_h, 10, sse4);
-mc_bi_w_funcs(qpel_v, 10, sse4);
-mc_bi_w_funcs(qpel_hv, 10, sse4);
-
-mc_bi_w_funcs(pel_pixels, 12, sse4);
-mc_bi_w_func(pel_pixels, 12, 6, sse4);
-mc_bi_w_funcs(epel_h, 12, sse4);
-mc_bi_w_func(epel_h, 12, 6, sse4);
-mc_bi_w_funcs(epel_v, 12, sse4);
-mc_bi_w_func(epel_v, 12, 6, sse4);
-mc_bi_w_funcs(epel_hv, 12, sse4);
-mc_bi_w_func(epel_hv, 12, 6, sse4);
-mc_bi_w_funcs(qpel_h, 12, sse4);
-mc_bi_w_funcs(qpel_v, 12, sse4);
-mc_bi_w_funcs(qpel_hv, 12, sse4);
-#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+mc_bi_w_funcs(pel_pixels, 8, ssse3);
+mc_bi_w_func(pel_pixels, 8, 6, ssse3);
+mc_bi_w_funcs(epel_h, 8, ssse3);
+mc_bi_w_func(epel_h, 8, 6, ssse3);
+mc_bi_w_funcs(epel_v, 8, ssse3);
+mc_bi_w_func(epel_v, 8, 6, ssse3);
+mc_bi_w_funcs(epel_hv, 8, ssse3);
+mc_bi_w_func(epel_hv, 8, 6, ssse3);
+mc_bi_w_funcs(qpel_h, 8, ssse3);
+mc_bi_w_funcs(qpel_v, 8, ssse3);
+mc_bi_w_funcs(qpel_hv, 8, ssse3);
+
+mc_bi_w_funcs(pel_pixels, 10, ssse3);
+mc_bi_w_func(pel_pixels, 10, 6, ssse3);
+mc_bi_w_funcs(epel_h, 10, ssse3);
+mc_bi_w_func(epel_h, 10, 6, ssse3);
+mc_bi_w_funcs(epel_v, 10, ssse3);
+mc_bi_w_func(epel_v, 10, 6, ssse3);
+mc_bi_w_funcs(epel_hv, 10, ssse3);
+mc_bi_w_func(epel_hv, 10, 6, ssse3);
+mc_bi_w_funcs(qpel_h, 10, ssse3);
+mc_bi_w_funcs(qpel_v, 10, ssse3);
+mc_bi_w_funcs(qpel_hv, 10, ssse3);
+
+mc_bi_w_funcs(pel_pixels, 12, ssse3);
+mc_bi_w_func(pel_pixels, 12, 6, ssse3);
+mc_bi_w_funcs(epel_h, 12, ssse3);
+mc_bi_w_func(epel_h, 12, 6, ssse3);
+mc_bi_w_funcs(epel_v, 12, ssse3);
+mc_bi_w_func(epel_v, 12, 6, ssse3);
+mc_bi_w_funcs(epel_hv, 12, ssse3);
+mc_bi_w_func(epel_hv, 12, 6, ssse3);
+mc_bi_w_funcs(qpel_h, 12, ssse3);
+mc_bi_w_funcs(qpel_v, 12, ssse3);
+mc_bi_w_funcs(qpel_hv, 12, ssse3);
+#endif //ARCH_X86_64 && HAVE_SSSE3_EXTERNAL
 
 
 #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
@@ -528,18 +528,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
-        }
-        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 
-            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     8, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     8, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    8, ssse3);
 
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, ssse3);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
@@ -581,17 +579,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
-        }
-        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
-            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
-
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, ssse3);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, ssse3);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
@@ -630,17 +627,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
-        }
-        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
-            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
-
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
+
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, ssse3);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, ssse3);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, ssse3);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, ssse3);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
-- 
1.9.2.msysgit.0



More information about the ffmpeg-devel mailing list