[FFmpeg-devel] [PATCH] JPEG2000: SSE optimisation of DWT decoding

Nicolas Bertrand nicoinattendu at gmail.com
Fri Oct 6 18:30:57 EEST 2017


From: Maxime Taisant <maximetaisant at hotmail.fr>

---
 libavcodec/jpeg2000dwt.c          |   45 +-
 libavcodec/jpeg2000dwt.h          |    5 +
 libavcodec/x86/jpeg2000dsp.asm    | 1339 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/jpeg2000dsp_init.c |  119 ++++
 tests/checkasm/jpeg2000dsp.c      |    1 +
 5 files changed, 1496 insertions(+), 13 deletions(-)

diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..1a0c3fc034 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -30,6 +30,7 @@
 #include "libavutil/mem.h"
 #include "jpeg2000dwt.h"
 #include "internal.h"
+#include "libavutil/timer.h"
 
 /* Defines for 9/7 DWT lifting parameters.
  * Parameters are in float. */
@@ -558,7 +559,7 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
         }
     switch (type) {
     case FF_DWT97:
-        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
+        s->f_linebuf = av_malloc_array(4*(maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
@@ -575,6 +576,11 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
     default:
         return -1;
     }
+
+    s->sse = 0;
+    if (ARCH_X86)
+        ff_jpeg2000dwt_init_x86(s, type);
+
     return 0;
 }
 
@@ -601,18 +607,31 @@ int ff_dwt_decode(DWTContext *s, void *t)
     if (s->ndeclevels == 0)
         return 0;
 
-    switch (s->type) {
-    case FF_DWT97:
-        dwt_decode97_float(s, t);
-        break;
-    case FF_DWT97_INT:
-        dwt_decode97_int(s, t);
-        break;
-    case FF_DWT53:
-        dwt_decode53(s, t);
-        break;
-    default:
-        return -1;
+    switch(s->type){
+        case FF_DWT97:
+            if (s->sse)
+            //{
+            //    START_TIMER
+                dwt_decode97_float_sse(s, t);
+            //    STOP_TIMER("dwt_decode97_float_sse");
+            //}
+            else            
+            //{
+            //    START_TIMER
+                dwt_decode97_float(s, t);
+            //    STOP_TIMER("dwt_decode97_float");
+            //}
+            /*{
+                START_TIMER
+                STOP_TIMER("decode_NULL");
+            }*/
+            break;
+        case FF_DWT97_INT:
+            dwt_decode97_int(s, t); break;
+        case FF_DWT53:
+            dwt_decode53(s, t); break;
+        default:
+            return -1;
     }
     return 0;
 }
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..622a404b79 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -48,6 +48,7 @@ typedef struct DWTContext {
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
     int32_t *i_linebuf;                  ///< int buffer used by transform
     float   *f_linebuf;                  ///< float buffer used by transform
+    int sse;
 } DWTContext;
 
 /**
@@ -65,4 +66,8 @@ int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
 
+void dwt_decode97_float_sse(DWTContext *s, float *t);
+
+void ff_jpeg2000dwt_init_x86(DWTContext *s, int type);
+
 #endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..b5d5b9a04b 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -2,6 +2,7 @@
 ;* SIMD-optimized JPEG2000 DSP functions
 ;* Copyright (c) 2014 Nicolas Bertrand
 ;* Copyright (c) 2015 James Almer
+;* Copyright (c) 2017 Maxime Taisant
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -29,6 +30,16 @@ pf_ict1: times 8 dd 0.34413
 pf_ict2: times 8 dd 0.71414
 pf_ict3: times 8 dd 1.772
 
+F_LFTG_K: dd 1.230174104914001                  ; scalar: divided by TWO and applied to line[1] in the short-line path
+F_LFTG_X: dd 0.812893066115961                  ; scalar: applied to line[0] in the other short-line path
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924      ; replicated so movups can load it as a packed multiplier (4th lifting pass)
+F_LFTG_BETA: times 8 dd 0.052980118572961       ; packed multiplier for the 3rd lifting pass
+F_LFTG_GAMMA: times 8 dd 0.882911075530934      ; packed multiplier for the 2nd lifting pass
+F_LFTG_DELTA: times 8 dd 0.443506852043971      ; packed multiplier for the 1st lifting pass
+
+TWO: dd 2.0                                     ; divisor used to form F_LFTG_K/2 at run time
+
 SECTION .text
 
 ;***********************************************************************
@@ -142,3 +153,1331 @@ RCT_INT
 INIT_YMM avx2
 RCT_INT
 %endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *line, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 1
+cglobal sr_1d97_float, 3, 5, %1, line, i0, i1, j0, j1
+    movsxd j0q, i0d                ; i0/i1 are C ints: sign-extend rather than trust the upper register halves
+    movsxd j1q, i1d
+    add   j0q, 1
+    cmp   j1q, j0q
+    jg %%extend                    ; i1 > i0+1: enough samples, run the full lifting path
+    sub   j0q, 2                   ; j0 = i0-1, which is zero only when i0 == 1
+    jnz %%else
+    movss  m0, [lineq+4]           ; i0 == 1: line[1] *= F_LFTG_K/2
+    movss  m1, [F_LFTG_K]
+    movss  m2, [TWO]
+    divss  m1, m2
+    mulss  m0, m1
+    movss  [lineq+4], m0
+    jmp %%end
+
+%%else:
+    movss  m0, [lineq]             ; i0 != 1: line[0] *= F_LFTG_X
+    movss  m1, [F_LFTG_X]
+    mulss  m0, m1
+    movss [lineq], m0
+    jmp %%end
+
+%%extend:
+    shl   i0d, 2
+    shl   i1d, 2
+    mov   j0q, i0q
+    mov   j1q, i1q
+    movups m0, [lineq+j0q+4]
+    shufps m0, m0, q0123
+    movups [lineq+j0q-16], m0
+    movups m0, [lineq+j1q-20]
+    shufps m0, m0, q0123
+    movups [lineq+j1q], m0
+
+    movups m3, [F_LFTG_DELTA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 8
+    cmp   j0q, j1q
+    jge %%beginloop2
+%%loop1:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop1
+ 
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -= F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop2
+    jmp %%loop1
+  
+%%endloop1:
+    sub   j0q, 12
+%%littleloop1:
+    movss  m0, [lineq+2*j0q]
+    movss  m1, [lineq+2*j0q-4]
+    movss  m2, [lineq+2*j0q+4]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss [lineq+2*j0q], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop1
+
+%%beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 4
+    cmp   j0q, j1q
+    jge %%beginloop3
+%%loop2:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop2
+ 
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} -= F_LFTG_GAMMA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop3
+    jmp %%loop2
+  
+%%endloop2:
+    sub   j0q, 12
+%%littleloop2:
+    movss  m0, [lineq+2*j0q+4]
+    movss  m1, [lineq+2*j0q]
+    movss  m2, [lineq+2*j0q+8]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [lineq+2*j0q+4], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop2
+
+%%beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 8
+    cmp   j0q, j1q
+    jge %%beginloop4
+%%loop3:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop3
+ 
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} += F_LFTG_BETA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop4
+    jmp %%loop3
+  
+%%endloop3:
+    sub   j0q, 12
+%%littleloop3:
+    movss  m0, [lineq+2*j0q]
+    movss  m1, [lineq+2*j0q-4]
+    movss  m2, [lineq+2*j0q+4]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [lineq+2*j0q], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop3
+
+%%beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 4
+    cmp   j0q, j1q
+    jge %%end
+%%loop4:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop4
+ 
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} += F_LFTG_ALPHA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%end
+    jmp %%loop4
+  
+%%endloop4:
+    sub   j0q, 12
+%%littleloop4:
+    movss  m0, [lineq+2*j0q+4]
+    movss  m1, [lineq+2*j0q]
+    movss  m2, [lineq+2*j0q+8]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [lineq+2*j0q+4], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop4
+
+%%end:
+    REP_RET
+%endmacro
+    
+INIT_XMM sse
+SR1D97FLOAT 6
+
+%macro SR1D97FLOAT_ 5      ; p, i0, i1, tmp0, tmp1
+    mov    %4, %2
+    mov    %5, %3
+    add    %4, 1
+    cmp    %5, %4
+    jg %%extend
+    sub    %4, 2
+    jnz %%else
+    movss  m0, [%1+4]
+    movss  m1, [F_LFTG_K]
+    movss  m2, [TWO]
+    divss  m1, m2
+    mulss  m0, m1
+    movss  [%1+4], m0
+    jmp %%end
+
+%%else:
+    movss  m0, [%1]
+    movss  m1, [F_LFTG_X]
+    mulss  m0, m1
+    movss  [%1], m0
+    jmp %%end
+
+%%extend:
+    shl    %2, 2
+    shl    %3, 2
+    mov    %4, %2
+    mov    %5, %3
+    movups m0, [%1+%4+4]
+    shufps m0, m0, q0123
+    movups [%1+%4-16], m0
+    movups m0, [%1+%5-20]
+    shufps m0, m0, q0123
+    movups [%1+%5], m0
+
+    movups m3, [F_LFTG_DELTA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 8
+    cmp    %4, %5
+    jge %%beginloop2
+%%loop1:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop1
+ 
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop2
+    jmp %%loop1
+  
+%%endloop1:
+    sub    %4, 12
+%%littleloop1:
+    movss  m0, [%1+2*%4]
+    movss  m1, [%1+2*%4-4]
+    movss  m2, [%1+2*%4+4]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [%1+2*%4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop1
+
+%%beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 4
+    cmp    %4, %5
+    jge %%beginloop3
+%%loop2:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop2
+ 
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop3
+    jmp %%loop2
+  
+%%endloop2:
+    sub    %4, 12
+%%littleloop2:
+    movss  m0, [%1+2*%4+4]
+    movss  m1, [%1+2*%4]
+    movss  m2, [%1+2*%4+8]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [%1+2*%4+4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop2
+
+%%beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 8
+    cmp    %4, %5
+    jge %%beginloop4
+%%loop3:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop3
+
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop4
+    jmp %%loop3
+  
+%%endloop3:
+    sub    %4, 12
+%%littleloop3:
+    movss  m0, [%1+2*%4]
+    movss  m1, [%1+2*%4-4]
+    movss  m2, [%1+2*%4+4]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss  [%1+2*%4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop3
+
+%%beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 4
+    cmp    %4, %5
+    jge %%end
+%%loop4:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop4
+ 
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%end
+    jmp %%loop4
+  
+%%endloop4:
+    sub    %4, 12
+%%littleloop4:
+    movss  m0, [%1+2*%4+4]
+    movss  m1, [%1+2*%4]
+    movss  m2, [%1+2*%4+8]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [%1+2*%4+4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop4
+
+%%end:
+    shr    %2, 2
+    shr    %3, 2
+%endmacro
+
+
+;***********************************************************************
+; ff_hor_sd_float_<opt>(float *line, float *data, int mh, int lh, int lv, int w)
+;***********************************************************************
+%macro HORSDFLOAT 1
+cglobal hor_sd_float, 6, 12, %1, line, data, mh, lh, lv, w, l, lp, i0, i1, j0, j1
+    mov    lq, mhq
+    shl    lq, 2
+    add    lq, lineq
+    shl   lhq, 2
+    
+    mov   lpq, 0
+%%mainloop:
+    ;j0 = w*lp+j
+    mov   j0q, wq
+    imul  j0q, lpq
+
+    ;j1 = (lh-mh+1)/2 + j0
+    mov   j1q, lhq
+    shr   j1q, 2
+    sub   j1q, mhq
+    add   j1q, 1
+    shr   j1q, 1
+    add   j1q, j0q
+
+    shl   j0q, 2
+    shl   j1q, 2
+
+    ;i1 = 1-mh
+    mov   i1q, 1
+    sub   i1q, mhq
+    shl   i1q, 2
+
+    ;i0 = mh
+    mov   i0q, mhq
+    shl   i0q, 2
+ 
+    cmp   i0q, i1q
+    jg %%i1i0
+
+;i0 < i1
+    cmp   i1q, lhq
+    jge %%i0
+    
+    add   i0q, 4
+    cmp   i0q, i1q
+    jne %%inci0
+ 
+;i1 = i0+1
+%%beginloopi0i1   
+    sub   i0q, 4
+
+%%loopi0i1:
+    add   i1q, 24
+    cmp   i1q, lhq
+    jge %%endloopi0i1
+
+    ;l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    ;l{i0,i0+3,i0+5,i0+7} = l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    movups m0, [dataq+j0q]
+    movups m2, [dataq+j1q]
+    movaps m1, m0
+    movlhps m0, m2
+    shufps m0, m0, q3120
+    shufps m1, m2, q3232
+    shufps m1, m1, q3120
+    movups [lq+i0q], m0
+    movups [lq+i0q+16], m1
+
+    add   i1q, 8
+    add   i0q, 32
+    add   j0q, 16
+    add   j1q, 16
+    cmp   i1q, lhq
+    jl %%loopi0i1  
+    cmp   i0q, lhq
+    jge %%sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp %%sr_1d
+
+;i1 + 6 >= lh
+%%endloopi0i1:
+    sub   i1q, 24
+%%littleloopi0i1:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss m0, [dataq+j0q]
+    movss m1, [dataq+j1q]
+    movss [lq+i0q], m0
+    movss [lq+i1q], m1
+
+    add   i0q, 8
+    add   i1q, 8
+    add   j0q, 4
+    add   j1q, 4
+    cmp   i1q, lhq
+    jl %%littleloopi0i1
+    cmp   i0q, lhq
+    jge %%sr_1d
+
+    ;i1>=lh & i0<lh
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q], m0
+    jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+    cmp   i0q, lhq
+    jge %%i1
+    
+    add   i1q, 4
+    cmp   i0q, i1q
+    jne %%inci1
+
+;i0 = i1+1
+%%beginloopi1i0    
+    sub   i1q, 4
+
+%%loopi1i0:
+    add   i0q, 24
+    cmp   i0q, lhq
+    jge %%endloopi1i0
+
+    ;l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    ;l{i1,i1+3,i1+5,i1+7} = l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    movups m0, [dataq+j1q]
+    movups m2, [dataq+j0q]
+    movaps m1, m0
+    movlhps m0, m2
+    shufps m0, m0, q3120
+    shufps m1, m2, q3232
+    shufps m1, m1, q3120
+    movups [lq+i1q], m0
+    movups [lq+i1q+16], m1
+
+    add   i0q, 8
+    add   i1q, 32
+    add   j0q, 16
+    add   j1q, 16
+    cmp   i0q, lhq
+    jl %%loopi1i0  
+    cmp   i1q, lhq
+    jge %%sr_1d
+
+    ;i0>=lh & i1<lh
+    movss  m0, [dataq+j1q]
+    movss  [lq+i1q], m0
+    jmp %%sr_1d
+
+%%endloopi1i0:
+    sub   i0q, 24                  ; undo the look-ahead that %%loopi1i0 added to i0 (not i1)
+%%littleloopi1i0:
+    ;scalar tail of the i1 < i0 case: copy one sample of each band per iteration
+    ;l[i1] <- data[j1]
+    ;l[i0] <- data[j0]
+    movss  m0, [dataq+j1q]
+    movss  m1, [dataq+j0q]
+    movss  [lq+i1q], m0
+    movss  [lq+i0q], m1
+
+    add   i0q, 8
+    add   i1q, 8
+    add   j0q, 4
+    add   j1q, 4
+    cmp   i0q, lhq                 ; i0 is the leading index here, so it bounds the paired copy
+    jl %%littleloopi1i0
+    cmp   i1q, lhq
+    jge %%sr_1d
+
+    ;i0>=lh & i1<lh: one trailing sample is left for the i1 band
+    movss  m0, [dataq+j1q]
+    movss  [lq+i1q], m0
+    jmp %%sr_1d
+
+;i0<i1 & i1>=lh
+%%i0:
+    cmp   i0q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q], m0
+    add   i0q, 8
+    add   j0q, 4
+    jmp %%i0
+
+;i1<i0 & i0>=lh
+%%i1:
+    cmp   i1q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    add   i1q, 8
+    add   j1q, 4
+    jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+    cmp   i0q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q-4], m0
+    add   i0q, 8
+    add   j0q, 4
+    cmp   i0q, i1q
+    je %%beginloopi0i1
+    jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+    cmp   i1q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j1q]
+    movss  [lq+i1q-4], m0
+    add   i1q, 8
+    add   j1q, 4
+    cmp   i0q, i1q
+    je %%beginloopi1i0
+    jmp %%inci1
+
+%%sr_1d:
+    mov   i0q, mhq
+    mov   i1q, lhq
+    shr   i1q, 2
+    add   i1q, mhq
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+
+    mov   i0q, 0
+    cmp   i0q, lhq
+    jge %%endmainloop
+    mov   j0q, wq
+    imul  j0q, lpq
+    shl   j0q, 2
+%%subloop3:
+    add   i0q, 12
+    cmp   i0q, lhq
+    jge %%endsubloop3
+
+    movups  m0, [lq+i0q-12]
+    movups  [dataq+j0q], m0
+
+    add   i0q, 4
+    add   j0q, 16
+    cmp   i0q, lhq
+    jge %%endmainloop
+    jmp %%subloop3  
+
+%%endsubloop3:
+    sub   i0q, 12
+%%littlesubloop3:
+    movss  m0, [lq+i0q]
+    movss  [dataq+j0q], m0
+
+    add   i0q, 4
+    add   j0q, 4
+    cmp   i0q, lhq
+    jl %%littlesubloop3  
+
+%%endmainloop:
+    add   lpq, 1
+    cmp   lpq, lvq
+    jl %%mainloop
+
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+HORSDFLOAT 6
+
+;***********************************************************************
+; ff_ver_sd_float_<opt>(float *line, float *data, int mv, int lh, int lv, int w)
+;***********************************************************************
+%macro VERSDFLOAT 1
+cglobal ver_sd_float, 6, 12, %1, line, data, mv, lh, lv, w, lp, i0, i1, j0, j1, inc
+    shl   mvq, 2
+    add   lineq, mvq
+    mov   incq, lvq
+    add   incq, 12
+    shl   incq, 2
+    shl   lvq, 2
+    shl   wq, 2    
+    
+    mov   lpq, 0
+
+%%mainloop:
+    ;j0 = w*j+lp
+    mov   j0q, lpq
+
+    add   lpq, 3
+    cmp   lpq, lhq
+    jge %%beginmainloop2
+
+    shr   lvq, 2
+    shr   wq, 2
+    ;j1 = w*(lv-mv+1)/2 + j0
+    mov   j1q, lvq
+    sub   j1q, mvq
+    add   j1q, 1
+    shr   j1q, 1
+    imul  j1q, wq
+    add   j1q, j0q
+
+    shl   lvq, 2
+    shl   wq, 2
+    shl   j1q, 2
+    shl   j0q, 2
+
+    ;i1 = 1-mv
+    mov   i1q, 4
+    sub   i1q, mvq
+
+    ;i0 = mv
+    mov   i0q, mvq
+ 
+    cmp   i0q, i1q
+    jg %%i1i0
+
+;i0 < i1
+    cmp   i1q, lvq
+    jge %%i0
+  
+    add   i0q, 4
+    cmp   i0q, i1q
+    jne %%inci0
+ 
+;i1 = i0+1
+%%beginloopi0i1   
+    sub   i0q, 4
+
+%%loopi0i1:
+;    add   i1q, 12
+;    cmp   i1q, lvq
+;    jge %%endloopi0i1
+    
+;    movlps  m0, [dataq+j0q]
+;    movhps  m0, [dataq+j1q]
+;    movlps  m1, [dataq+j0q+8]
+;    movhps  m1, [dataq+j1q+8]
+;    add     j0q, wq
+;    add     j1q, wq
+;    movlps  m2, [dataq+j0q]
+;    movhps  m2, [dataq+j1q]
+;    movlps  m3, [dataq+j0q+8]
+;    movhps  m3, [dataq+j1q+8]
+;    movaps  m4, m0
+;    shufps  m0, m2, q2020
+;    shufps  m4, m2, q3131
+;    movaps  m2, m4
+;    movaps  m4, m1
+;    shufps  m1, m3, q2020
+;    shufps  m4, m3, q3131
+;    movaps  m3, m4
+;    movups  [lineq+i0q], m0
+;    add     lineq, incq
+;    movups  [lineq+i0q], m2
+;    add     lineq, incq
+;    movups  [lineq+i0q], m1
+;    add     lineq, incq
+;    movups  [lineq+i0q], m3
+;    sub     lineq, incq
+;    sub     lineq, incq
+;    sub     lineq, incq
+
+;    add     i1q, 4
+;    add     i0q, 16
+;    add     j0q, wq
+;    add     j1q, wq
+;    cmp     i1q, lvq
+;    jl %%loopi0i1 
+;    cmp     i0q, lvq
+;    jl %%lasti0 
+;    jmp %%sr_1d
+
+;i1 + 3 >= lv
+%%endloopi0i1:
+;    sub     i1q, 12
+%%littleloopi0i1:
+
+    movss   m0, [dataq+j0q]
+    movss   m1, [dataq+j1q]
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+4]
+    movss   m1, [dataq+j1q+4]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+8]
+    movss   m1, [dataq+j1q+8]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+12]
+    movss   m1, [dataq+j1q+12]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i1q, 8
+    add     i0q, 8
+    add     j0q, wq
+    add     j1q, wq
+    cmp     i1q, lvq
+    jl %%littleloopi0i1
+    cmp     i0q, lvq
+    jge %%sr_1d
+
+%%lasti0:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+    cmp     i0q, lvq
+    jge %%i1
+    
+    add     i1q, 4
+    cmp     i0q, i1q
+    jne %%inci1
+
+;i0 = i1+1
+%%beginloopi1i0    
+    sub     i1q, 4
+
+%%loopi1i0:
+;    add   i0q, 12
+;    cmp   i0q, lvq
+;    jge %%endloopi0i1
+    
+;    movlps  m0, [dataq+j1q]
+;    movhps  m0, [dataq+j0q]
+;    movlps  m1, [dataq+j1q+8]
+;    movhps  m1, [dataq+j0q+8]
+;    add     j0q, wq
+;    add     j1q, wq
+;    movlps  m2, [dataq+j1q]
+;    movhps  m2, [dataq+j0q]
+;    movlps  m3, [dataq+j1q+8]
+;    movhps  m3, [dataq+j0q+8]
+;    movaps  m4, m0
+;    shufps  m0, m2, q2020
+;    shufps  m4, m2, q3131
+;    movaps  m2, m4
+;    movaps  m4, m1
+;    shufps  m1, m3, q2020
+;    shufps  m4, m3, q3131
+;    movaps  m3, m4
+;    movups  [lineq+i1q], m0
+;    add     lineq, incq
+;    movups  [lineq+i1q], m2
+;    add     lineq, incq
+;    movups  [lineq+i1q], m1
+;    add     lineq, incq
+;    movups  [lineq+i1q], m3
+;    sub     lineq, incq
+;    sub     lineq, incq
+;    sub     lineq, incq
+
+;    add     i0q, 4
+;    add     i1q, 16
+;    add     j1q, wq
+;    add     j0q, wq
+;    cmp     i0q, lvq
+;    jl %%loopi1i0 
+;    cmp     i1q, lvq
+;    jl %%lasti1 
+;    jmp %%sr_1d
+
+%%endloopi1i0:
+;    sub     i1q, 12
+%%littleloopi1i0:
+
+    movss   m0, [dataq+j1q]
+    movss   m1, [dataq+j0q]
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+4]
+    movss   m1, [dataq+j0q+4]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+8]
+    movss   m1, [dataq+j0q+8]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+12]
+    movss   m1, [dataq+j0q+12]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i1q, 8
+    add     i0q, 8
+    add     j0q, wq
+    add     j1q, wq
+    cmp     i0q, lvq
+    jl %%littleloopi1i0
+    cmp     i1q, lvq
+    jge %%sr_1d
+
+%%lasti1:
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    jmp %%sr_1d
+
+;i0<i1 & i1>=lv
+%%i0:
+    cmp     i0q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i0q, 8
+    add     j0q, wq
+    jmp %%i0
+
+;i1<i0 & i0>=lh
+%%i1:
+    cmp     i1q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i1q, 8
+    add     j1q, wq
+    jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+    cmp     i0q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q-4], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i0q, 8
+    add     j0q, wq
+    cmp     i0q, i1q
+    je %%beginloopi0i1
+    jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+    cmp     i1q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q-4], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i1q, 8
+    add     j1q, wq
+    cmp     i0q, i1q
+    je %%beginloopi1i0
+    jmp %%inci1
+
+%%sr_1d:
+    sub     lineq, mvq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     lineq, mvq
+
+    mov     i0q, 0
+    ;cmp     i0q, lvq
+    ;jge %%endmainloop
+    mov     j0q, lpq
+    sub     j0q, 3
+    shl     j0q, 2
+%%loop3:
+    add     i0q, 12
+    cmp     i0q, lvq
+    jge %%endloop3
+
+    movups  m0, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m1, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m2, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m3, [lineq+i0q-12]
+    movaps  m4, m0
+    movaps  m5, m2
+    movlhps m0, m1
+    movlhps m2, m3
+    movaps  m6, m0
+    shufps  m0, m2, q2020
+    shufps  m6, m2, q3131
+    movaps  m2, m6
+    movhlps m1, m4
+    movhlps m3, m5
+    movaps  m6, m1
+    shufps  m1, m3, q2020
+    shufps  m6, m3, q3131
+    movaps  m3, m6
+    movups  [dataq+j0q], m0
+    add     j0q, wq
+    movups  [dataq+j0q], m2
+    add     j0q, wq
+    movups  [dataq+j0q], m1
+    add     j0q, wq
+    movups  [dataq+j0q], m3
+    add     j0q, wq
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i0q, 4
+    cmp     i0q, lvq
+    jge %%endmainloop
+    jmp %%loop3  
+
+%%endloop3:
+    sub     i0q, 12
+
+%%littleloop3:
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+4], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+8], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+12], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i0q, 4
+    add     j0q, wq
+    cmp     i0q, lvq
+    jl %%littleloop3  
+
+%%endmainloop:
+    add     lpq, 1
+    cmp     lpq, lhq
+    jl %%mainloop
+    jmp %%end
+
+%%beginmainloop2:
+    sub     lpq, 3
+%%mainloop2:
+    ;j0 = w*j+lp
+    mov   j0q, lpq
+    shl   j0q, 2
+
+    ;i0 = mv
+    mov   i0q, mvq
+ 
+    cmp     i0q, lvq
+    jge %%beginloop5
+%%loop4:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+
+    add     j0q, wq
+    add     i0q, 8
+    cmp     i0q, lvq
+    jl      %%loop4
+
+%%beginloop5:
+    ;i0 = 1-mv
+    mov   i0q, 4
+    sub   i0q, mvq
+    cmp     i0q, lvq
+    jge %%sr_1d_2
+%%loop5:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+
+    add     j0q, wq
+    add     i0q, 8
+    cmp     i0q, lvq
+    jl      %%loop5
+
+%%sr_1d_2:
+    sub     lineq, mvq             ; lineq was advanced by mv (bytes) on entry; rewind for the 1-D pass
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i1q, 2                 ; mv and lv are held as byte offsets; convert to sample indices
+    shr     i0q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, mvq
+
+    mov     i0q, 0
+    cmp     i0q, lvq
+    jge %%endmainloop2             ; nothing to write back: stay in the single-column loop
+    mov     j0q, lpq
+    shl     j0q, 2
+%%loop6:
+    movss   m0, [lineq+i0q]        ; copy the filtered column back, one row (w bytes) per step
+    movss   [dataq+j0q], m0
+
+    add     j0q, wq
+    add     i0q, 4
+    cmp     i0q, lvq
+    jl %%loop6 
+
+%%endmainloop2:
+    add   lpq, 1
+    cmp   lpq, lhq
+    jl %%mainloop2
+
+%%end:
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+VERSDFLOAT 7                       ; the write-back transpose in %%loop3 uses m0-m6, i.e. 7 XMM registers
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..04cd01379d 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -19,16 +19,23 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
+#include <stdio.h>
 
 void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
 void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
 
+void ff_sr_1d97_float_sse(float *line, int i0, int i1);
+void ff_hor_sd_float_sse(float *line, float *data, int mh, int lh, int lv, int w);
+void ff_ver_sd_float_sse(float *line, float *data, int mv, int lh, int lv, int w);
+
 av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,3 +55,55 @@ av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
         c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
     }
 }
+
+/**
+ * Select the SSE inverse-DWT path when the running CPU supports it.
+ *
+ * s->sse is set to 1 only when external SSE code is usable and the
+ * transform is the floating-point 9/7 one (FF_DWT97); in every other
+ * case the flag is left as the caller initialised it.
+ *
+ * @param s    DWT context whose sse flag may be raised
+ * @param type transform type (FF_DWT97, FF_DWT97_INT or FF_DWT53)
+ */
+av_cold void ff_jpeg2000dwt_init_x86(DWTContext *s, int type)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags) && type == FF_DWT97)
+        s->sse = 1;
+}
+
+/**
+ * Inverse floating-point 9/7 DWT using the SSE row/column kernels.
+ *
+ * For each decomposition level, from index 0 up to ndeclevels - 1, the
+ * horizontal kernel and then the vertical kernel are run in place on t,
+ * with s->f_linebuf as scratch space.
+ *
+ * @param s DWT context; ndeclevels must be > 0 (ff_dwt_decode() returns
+ *          before dispatching here otherwise)
+ * @param t coefficient plane, read and overwritten by the kernels
+ */
+void dwt_decode97_float_sse(DWTContext *s, float *t)
+{
+    /* horizontal line length of the last level, passed to the kernels as w */
+    int w       = s->linelen[s->ndeclevels - 1][0];
+    /* position at index 0 of line range [0-5,w+5] cf. extend function */
+    float *line = s->f_linebuf + 5;
+    float *data = t;
+    int lev;
+
+    for (lev = 0; lev < s->ndeclevels; lev++) {
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1];
+
+        // HOR_SD
+        ff_hor_sd_float_sse(line, data, mh, lh, lv, w);
+
+        // VER_SD
+        ff_ver_sd_float_sse(line, data, mv, lh, lv, w);
+    }
+}
diff --git a/tests/checkasm/jpeg2000dsp.c b/tests/checkasm/jpeg2000dsp.c
index 48559df085..92f3264674 100644
--- a/tests/checkasm/jpeg2000dsp.c
+++ b/tests/checkasm/jpeg2000dsp.c
@@ -20,6 +20,7 @@
 
 #include "checkasm.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
-- 
2.11.0



More information about the ffmpeg-devel mailing list