[FFmpeg-devel] [PATCH] JPEG2000: SSE optimisation of DWT decoding

maxime taisant maximetaisant at hotmail.fr
Tue Aug 8 12:09:44 EEST 2017


From: Maxime Taisant <maximetaisant at hotmail.fr>

Hi,

Here are some SSE optimisations for the DWT function used to decode JPEG2000.
I tested this code by using the time command while decoding a JPEG2000-encoded video with ffmpeg and, on average, I observed a 4.05% overall improvement, and a 12.67% improvement on the DWT decoding part alone.
In the nasm code, you will notice that the SR1D97FLOAT macro appears twice. One version (SR1D97FLOAT_) is expanded inside the nasm code by the HORSDFLOAT macro, and the other (SR1D97FLOAT) defines the function called from the C code of the DWT function; I couldn't figure out a way to use a single macro for both.
I also couldn't figure out a good way to optimize the VER_SD part, which is why I left it unchanged apart from calling an SSE-optimized version of the sr_1d97_float function.

Regards.

---
 libavcodec/jpeg2000dwt.c          |  21 +-
 libavcodec/jpeg2000dwt.h          |   6 +
 libavcodec/x86/jpeg2000dsp.asm    | 794 ++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/jpeg2000dsp_init.c |  55 +++
 4 files changed, 863 insertions(+), 13 deletions(-)

diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..69c935980d 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -558,16 +558,19 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
         }
     switch (type) {
     case FF_DWT97:
+        dwt_decode = dwt_decode97_float;
         s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
      case FF_DWT97_INT:
+        dwt_decode = dwt_decode97_int;
         s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
     case FF_DWT53:
+        dwt_decode = dwt_decode53;
         s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
@@ -575,6 +578,10 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
     default:
         return -1;
     }
+
+    if (ARCH_X86)
+        ff_jpeg2000dwt_init_x86(s, type);
+
     return 0;
 }
 
@@ -601,19 +608,7 @@ int ff_dwt_decode(DWTContext *s, void *t)
     if (s->ndeclevels == 0)
         return 0;
 
-    switch (s->type) {
-    case FF_DWT97:
-        dwt_decode97_float(s, t);
-        break;
-    case FF_DWT97_INT:
-        dwt_decode97_int(s, t);
-        break;
-    case FF_DWT53:
-        dwt_decode53(s, t);
-        break;
-    default:
-        return -1;
-    }
+    dwt_decode(s,t);
     return 0;
 }
 
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..8462ddf8cd 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -50,6 +50,8 @@ typedef struct DWTContext {
     float   *f_linebuf;                  ///< float buffer used by transform
 } DWTContext;
 
+void (*dwt_decode)(DWTContext *s, void *t);
+
 /**
  * Initialize DWT.
  * @param s                 DWT context
@@ -65,4 +67,8 @@ int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
 
+void dwt_decode97_float_sse(DWTContext *s, float *t);
+
+void ff_jpeg2000dwt_init_x86(DWTContext *s, int type);
+
 #endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..262704f288 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -2,6 +2,7 @@
 ;* SIMD-optimized JPEG2000 DSP functions
 ;* Copyright (c) 2014 Nicolas Bertrand
 ;* Copyright (c) 2015 James Almer
+;* Copyright (c) 2017 Maxime Taisant
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -29,6 +30,16 @@ pf_ict1: times 8 dd 0.34413
 pf_ict2: times 8 dd 0.71414
 pf_ict3: times 8 dd 1.772
 
+F_LFTG_K: dd 1.230174104914001
+F_LFTG_X: dd 0.812893066115961
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924
+F_LFTG_BETA: times 8 dd 0.052980118572961
+F_LFTG_GAMMA: times 8 dd 0.882911075530934
+F_LFTG_DELTA: times 8 dd 0.443506852043971
+
+TWO: dd 2.0
+
 SECTION .text
 
 ;***********************************************************************
@@ -142,3 +153,786 @@ RCT_INT
 INIT_YMM avx2
 RCT_INT
 %endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *line, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 1 ; %1 = XMM register count handed to cglobal
+cglobal sr_1d97_float, 3, 5, %1, line, i0, i1, j0, j1
+    mov j0q, i0q ; NOTE(review): i0/i1 are int in the C prototype but read as full q registers here - confirm the upper bits are defined
+    mov j1q, i1q
+    add j0q, 1
+    cmp j1q, j0q
+    jg .extend ; i1 > i0 + 1: run the four lifting passes
+    sub j0q, 2 ; j0 = i0 - 1, zero only when i0 == 1
+    jnz .else
+    movss m0, [lineq+4] ; i0 == 1: line[1] *= F_LFTG_K / 2
+    movss m1, [F_LFTG_K]
+    movss m2, [TWO]
+    divss m1, m2
+    mulss m0, m1
+    movss [lineq+4], m0
+    jmp .end
+
+.else:
+    movss m0, [lineq] ; i0 != 1: line[0] *= F_LFTG_X
+    movss m1, [F_LFTG_X]
+    mulss m0, m1
+    movss [lineq], m0
+    jmp .end
+
+.extend:
+    shl i0d, 2 ; i0, i1 become byte offsets (4 bytes per float)
+    shl i1d, 2
+    mov j0q, i0q
+    mov j1q, i1q
+    movups m0, [lineq+j0q+4] ; load line[i0+1 .. i0+4]
+    shufps m0, m0, 0x1B ; reverse the four lanes
+    movups [lineq+j0q-16], m0 ; store mirrored into line[i0-4 .. i0-1]
+    movups m0, [lineq+j1q-20] ; load line[i1-5 .. i1-2]
+    shufps m0, m0, 0x1B ; reverse the four lanes
+    movups [lineq+j1q], m0 ; store mirrored into line[i1 .. i1+3]
+
+    movups m3, [F_LFTG_DELTA] ; pass 1 coefficient in all lanes
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 1 ; j0 = 2*i0; NOTE(review): 2*j0 addresses an even sample only when i0 is even - confirm odd i0 is handled
+    sub j0q, 4 ; start one step (4 bytes of j) below
+    shr j1q, 1
+    add j1q, 8 ; end bound two steps above 2*i1
+    cmp j0q, j1q
+    jge .beginloop2
+.loop1:
+    add j0q, 12 ; look ahead: vector path needs 4 steps left
+    cmp j0q, j1q
+    jge .endloop1
+    ; vector path, offsets below are relative to the +12 look-ahead
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -= F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28] ; 8 consecutive floats, first half
+    movups m4, [lineq+2*j0q-12] ; 8 consecutive floats, second half
+    movups m1, m0
+    shufps m0, m4, 0xDD ; m0 = lanes 1,3,5,7 of the 8 floats (samples being updated)
+    shufps m1, m4, 0x88 ; m1 = lanes 0,2,4,6 (left neighbours)
+    movups m2, [lineq+2*j0q-24] ; same window shifted by one float
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, 0xDD ; m2 = right neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44 ; re-interleave neighbours and updated samples, low half
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE ; re-interleave, high half
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add j0q, 4 ; net advance of 16 = four steps
+    cmp j0q, j1q
+    jge .beginloop2
+    jmp .loop1
+  
+.endloop1:
+    sub j0q, 12 ; undo the look-ahead, finish with scalars
+.littleloop1:
+    movss m0, [lineq+2*j0q] ; one sample per iteration, same update as above
+    movss m1, [lineq+2*j0q-4]
+    movss m2, [lineq+2*j0q+4]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [lineq+2*j0q], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop1
+
+.beginloop2:
+    movups m3, [F_LFTG_GAMMA] ; pass 2 coefficient
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 1
+    sub j0q, 4 ; start one step below 2*i0
+    shr j1q, 1
+    add j1q, 4 ; end bound one step above 2*i1
+    cmp j0q, j1q
+    jge .beginloop3
+.loop2:
+    add j0q, 12 ; look ahead for the 4-wide path
+    cmp j0q, j1q
+    jge .endloop2
+    ; vector path, window starts one float later than in pass 1
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} -= F_LFTG_GAMMA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .beginloop3
+    jmp .loop2
+  
+.endloop2:
+    sub j0q, 12 ; scalar tail of pass 2
+.littleloop2:
+    movss m0, [lineq+2*j0q+4]
+    movss m1, [lineq+2*j0q]
+    movss m2, [lineq+2*j0q+8]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [lineq+2*j0q+4], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop2
+
+.beginloop3:
+    movups m3, [F_LFTG_BETA] ; pass 3 coefficient
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 1
+    sub j0q, 4 ; same bounds as pass 1
+    shr j1q, 1
+    add j1q, 8
+    cmp j0q, j1q
+    jge .beginloop4
+.loop3:
+    add j0q, 12 ; look ahead for the 4-wide path
+    cmp j0q, j1q
+    jge .endloop3
+    ; vector path, same window as pass 1 but the product is added
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} += F_LFTG_BETA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .beginloop4
+    jmp .loop3
+  
+.endloop3:
+    sub j0q, 12 ; scalar tail of pass 3
+.littleloop3:
+    movss m0, [lineq+2*j0q]
+    movss m1, [lineq+2*j0q-4]
+    movss m2, [lineq+2*j0q+4]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [lineq+2*j0q], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop3
+
+.beginloop4:
+    movups m3, [F_LFTG_ALPHA] ; pass 4 coefficient
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 1
+    sub j0q, 4 ; same bounds as pass 2
+    shr j1q, 1
+    add j1q, 4
+    cmp j0q, j1q
+    jge .end
+.loop4:
+    add j0q, 12 ; look ahead for the 4-wide path
+    cmp j0q, j1q
+    jge .endloop4
+    ; vector path, same window as pass 2 but the product is added
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} += F_LFTG_ALPHA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .end
+    jmp .loop4
+  
+.endloop4:
+    sub j0q, 12 ; scalar tail of pass 4
+.littleloop4:
+    movss m0, [lineq+2*j0q+4]
+    movss m1, [lineq+2*j0q]
+    movss m2, [lineq+2*j0q+8]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [lineq+2*j0q+4], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop4
+
+.end:
+    REP_RET
+%endmacro
+    
+INIT_XMM sse
+SR1D97FLOAT 6 ; m0-m5 are the only vector registers used above
+
+%macro SR1D97FLOAT_ 5      ; p, i0, i1, tmp0, tmp1 - inline copy of the sr_1d97_float body; labels are plain local labels, so expand at most once per function
+    mov %4, %2
+    mov %5, %3
+    add %4, 1
+    cmp %5, %4
+    jg .extend ; i1 > i0 + 1: run the four lifting passes
+    sub %4, 2 ; tmp0 = i0 - 1, zero only when i0 == 1
+    jnz .else
+    movss m0, [%1+4] ; i0 == 1: p[1] *= F_LFTG_K / 2
+    movss m1, [F_LFTG_K]
+    movss m2, [TWO]
+    divss m1, m2
+    mulss m0, m1
+    movss [%1+4], m0
+    jmp .end ; NOTE(review): .end shifts i0/i1 right by 2 although this path never shifted them left - harmless only if the caller does not reuse them
+
+.else:
+    movss m0, [%1] ; i0 != 1: p[0] *= F_LFTG_X
+    movss m1, [F_LFTG_X]
+    mulss m0, m1
+    movss [%1], m0
+    jmp .end
+
+.extend:
+    shl %2, 2 ; i0, i1 become byte offsets (restored at .end)
+    shl %3, 2
+    mov %4, %2
+    mov %5, %3
+    movups m0, [%1+%4+4] ; load p[i0+1 .. i0+4]
+    shufps m0, m0, 0x1B ; reverse the four lanes
+    movups [%1+%4-16], m0 ; store mirrored into p[i0-4 .. i0-1]
+    movups m0, [%1+%5-20] ; load p[i1-5 .. i1-2]
+    shufps m0, m0, 0x1B ; reverse the four lanes
+    movups [%1+%5], m0 ; store mirrored into p[i1 .. i1+3]
+
+    movups m3, [F_LFTG_DELTA] ; pass 1: p[2i] -= DELTA * (p[2i-1] + p[2i+1])
+    mov %4, %2
+    mov %5, %3
+    shr %4, 1 ; tmp0 = 2*i0; NOTE(review): 2*tmp0 addresses an even sample only when i0 is even - confirm odd i0 is handled
+    sub %4, 4 ; start one step below
+    shr %5, 1
+    add %5, 8 ; end bound two steps above 2*i1
+    cmp %4, %5
+    jge .beginloop2
+.loop1:
+    add %4, 12 ; look ahead: vector path needs 4 steps left
+    cmp %4, %5
+    jge .endloop1
+    ; vector path, offsets below are relative to the +12 look-ahead
+    movups m0, [%1+2*%4-28] ; 8 consecutive floats, first half
+    movups m4, [%1+2*%4-12] ; 8 consecutive floats, second half
+    movups m1, m0
+    shufps m0, m4, 0xDD ; lanes 1,3,5,7: samples being updated
+    shufps m1, m4, 0x88 ; lanes 0,2,4,6: left neighbours
+    movups m2, [%1+2*%4-24] ; same window shifted by one float
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44 ; re-interleave, low half
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE ; re-interleave, high half
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add %4, 4 ; net advance of 16 = four steps
+    cmp %4, %5
+    jge .beginloop2
+    jmp .loop1
+  
+.endloop1:
+    sub %4, 12 ; undo the look-ahead, finish with scalars
+.littleloop1:
+    movss m0, [%1+2*%4]
+    movss m1, [%1+2*%4-4]
+    movss m2, [%1+2*%4+4]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [%1+2*%4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop1
+
+.beginloop2:
+    movups m3, [F_LFTG_GAMMA] ; pass 2: p[2i+1] -= GAMMA * (p[2i] + p[2i+2])
+    mov %4, %2
+    mov %5, %3
+    shr %4, 1
+    sub %4, 4 ; start one step below 2*i0
+    shr %5, 1
+    add %5, 4 ; end bound one step above 2*i1
+    cmp %4, %5
+    jge .beginloop3
+.loop2:
+    add %4, 12 ; look ahead for the 4-wide path
+    cmp %4, %5
+    jge .endloop2
+    ; vector path, window starts one float later than in pass 1
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .beginloop3
+    jmp .loop2
+  
+.endloop2:
+    sub %4, 12 ; scalar tail of pass 2
+.littleloop2:
+    movss m0, [%1+2*%4+4]
+    movss m1, [%1+2*%4]
+    movss m2, [%1+2*%4+8]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [%1+2*%4+4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop2
+
+.beginloop3:
+    movups m3, [F_LFTG_BETA] ; pass 3: p[2i] += BETA * (p[2i-1] + p[2i+1])
+    mov %4, %2
+    mov %5, %3
+    shr %4, 1
+    sub %4, 4 ; same bounds as pass 1
+    shr %5, 1
+    add %5, 8
+    cmp %4, %5
+    jge .beginloop4
+.loop3:
+    add %4, 12 ; look ahead for the 4-wide path
+    cmp %4, %5
+    jge .endloop3
+
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .beginloop4
+    jmp .loop3
+  
+.endloop3:
+    sub %4, 12 ; scalar tail of pass 3
+.littleloop3:
+    movss m0, [%1+2*%4]
+    movss m1, [%1+2*%4-4]
+    movss m2, [%1+2*%4+4]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [%1+2*%4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop3
+
+.beginloop4:
+    movups m3, [F_LFTG_ALPHA] ; pass 4: p[2i+1] += ALPHA * (p[2i] + p[2i+2])
+    mov %4, %2
+    mov %5, %3
+    shr %4, 1
+    sub %4, 4 ; same bounds as pass 2
+    shr %5, 1
+    add %5, 4
+    cmp %4, %5
+    jge .end
+.loop4:
+    add %4, 12 ; look ahead for the 4-wide path
+    cmp %4, %5
+    jge .endloop4
+    ; vector path, same window as pass 2 but the product is added
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD ; samples being updated
+    shufps m1, m4, 0x88 ; left neighbours
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, 0xDD ; right neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .end
+    jmp .loop4
+  
+.endloop4:
+    sub %4, 12 ; scalar tail of pass 4
+.littleloop4:
+    movss m0, [%1+2*%4+4]
+    movss m1, [%1+2*%4]
+    movss m2, [%1+2*%4+8]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [%1+2*%4+4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop4
+
+.end:
+    shr %2, 2 ; convert i0, i1 back from byte offsets to indices
+    shr %3, 2
+%endmacro
+
+
+;***********************************************************************
+; ff_hor_sd_float_<opt>(float *line, float *data, int mh, int lh, int lv, int w)
+;***********************************************************************
+%macro HORSDFLOAT 1 ; %1 = XMM register count handed to cglobal
+cglobal hor_sd_float, 6, 12, %1, line, data, mh, lh, lv, w, l, lp, i0, i1, j0, j1 ; NOTE(review): 12 GPRs - presumably x86-64 only, confirm the build guards this
+    mov lq, mhq ; l = line + mh (as a byte address)
+    shl lq, 2
+    add lq, lineq
+    shl lhq, 2 ; lh is kept as a byte count from here on
+    
+    mov lpq, 0 ; lp = current row, 0 .. lv-1
+.mainloop:
+    ;j0 = w*lp+j
+    mov j0q, wq
+    imul j0q, lpq
+
+    ;j1 = (lh-mh+1)/2 + j0
+    mov j1q, lhq
+    shr j1q, 2
+    sub j1q, mhq
+    add j1q, 1
+    shr j1q, 1
+    add j1q, j0q
+
+    shl j0q, 2 ; j0, j1 become byte offsets into data
+    shl j1q, 2
+
+    ;i1 = 1-mh
+    mov i1q, 1
+    sub i1q, mhq
+    shl i1q, 2
+
+    ;i0 = mh
+    mov i0q, mhq
+    shl i0q, 2
+ 
+    cmp i0q, i1q
+    jg .i1i0
+
+;i0 < i1
+    cmp i1q, lhq
+    jge .i0
+    
+    add i0q, 4
+    cmp i0q, i1q
+    jne .inci0
+ 
+;i1 = i0+1
+.beginloopi0i1:
+    sub i0q, 4
+
+.loopi0i1:
+    add i1q, 24 ; look ahead: the vector path writes up to l[i1+6]
+    cmp i1q, lhq
+    jge .endloopi0i1
+
+    ;l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    ;l{i0,i0+3,i0+5,i0+7} = l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    movups m0, [dataq+j0q]
+    movups m2, [dataq+j1q]
+    movups m1, m0
+    shufps m0, m2, 0x44 ; interleave the low halves of both sources
+    shufps m0, m0, 0xD8
+    shufps m1, m2, 0xEE ; interleave the high halves of both sources
+    shufps m1, m1, 0xD8
+    movups [lq+i0q], m0
+    movups [lq+i0q+16], m1
+
+    add i1q, 8 ; net advance of 32 bytes = 8 floats of l
+    add i0q, 32
+    add j0q, 16
+    add j1q, 16
+    cmp i1q, lhq
+    jl .loopi0i1  
+    cmp i0q, lhq
+    jge .sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp .sr_1d
+
+;i1 + 6 >= lh
+.endloopi0i1:
+    sub i1q, 24 ; undo the look-ahead applied to i1 in .loopi0i1
+.littleloopi0i1:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss m0, [dataq+j0q]
+    movss m1, [dataq+j1q]
+    movss [lq+i0q], m0
+    movss [lq+i1q], m1
+
+    add i0q, 8
+    add i1q, 8
+    add j0q, 4
+    add j1q, 4
+    cmp i1q, lhq
+    jl .littleloopi0i1
+    cmp i0q, lhq
+    jge .sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp .sr_1d
+
+;i1 < i0
+.i1i0:
+    cmp i0q, lhq
+    jge .i1
+    
+    add i1q, 4
+    cmp i0q, i1q
+    jne .inci1
+
+;i0 = i1+1
+.beginloopi1i0:
+    sub i1q, 4
+
+.loopi1i0:
+    add i0q, 24 ; look ahead: the vector path writes up to l[i0+6]
+    cmp i0q, lhq
+    jge .endloopi1i0
+
+    ;l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    ;l{i1,i1+3,i1+5,i1+7} = l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    movups m0, [dataq+j1q]
+    movups m2, [dataq+j0q]
+    movups m1, m0
+    shufps m0, m2, 0x44 ; interleave the low halves of both sources
+    shufps m0, m0, 0xD8
+    shufps m1, m2, 0xEE ; interleave the high halves of both sources
+    shufps m1, m1, 0xD8
+    movups [lq+i1q], m0
+    movups [lq+i1q+16], m1
+
+    add i0q, 8 ; net advance of 32 bytes = 8 floats of l
+    add i1q, 32
+    add j0q, 16
+    add j1q, 16
+    cmp i0q, lhq
+    jl .loopi1i0  
+    cmp i1q, lhq
+    jge .sr_1d
+
+    ;i0>=lh & i1<lh
+    movss m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    jmp .sr_1d
+
+.endloopi1i0:
+    sub i0q, 24 ; undo the look-ahead applied to i0 in .loopi1i0 (i1 was not touched)
+.littleloopi1i0:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss m0, [dataq+j1q]
+    movss m1, [dataq+j0q]
+    movss [lq+i1q], m0
+    movss [lq+i0q], m1
+
+    add i0q, 8
+    add i1q, 8
+    add j0q, 4
+    add j1q, 4
+    cmp i1q, lhq
+    jl .littleloopi1i0
+    cmp i0q, lhq
+    jge .sr_1d
+
+    ;i0>=lh & i1<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp .sr_1d
+
+;i0<i1 & i1>=lh
+.i0:
+    cmp i0q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    add i0q, 8
+    add j0q, 4
+    jmp .i0
+
+;i1<i0 & i0>=lh
+.i1:
+    cmp i1q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    add i1q, 8
+    add j1q, 4
+    jmp .i1
+
+;i0 < i1-1
+.inci0:
+    cmp i0q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j0q]
+    movss [lq+i0q-4], m0
+    add i0q, 8
+    add j0q, 4
+    cmp i0q, i1q
+    je .beginloopi0i1
+    jmp .inci0
+
+;i1 < i0-1
+.inci1:
+    cmp i1q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j1q]
+    movss [lq+i1q-4], m0
+    add i1q, 8
+    add j1q, 4
+    cmp i0q, i1q
+    je .beginloopi1i0
+    jmp .inci1
+
+.sr_1d:
+    mov i0q, mhq ; arguments for the inline lifting: i0 = mh, i1 = mh + lh
+    mov i1q, lhq
+    shr i1q, 2
+    add i1q, mhq
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+
+    mov i0q, 0 ; copy l[0 .. lh) back to row lp of data
+    cmp i0q, lhq
+    jge .endmainloop
+    mov j0q, wq
+    imul j0q, lpq
+    shl j0q, 2
+.subloop3:
+    add i0q, 12 ; look ahead: vector copy needs 4 floats left
+    cmp i0q, lhq
+    jge .endsubloop3
+
+    movups m0, [lq+i0q-12]
+    movups [dataq+j0q], m0
+
+    add i0q, 4
+    add j0q, 16
+    cmp i0q, lhq
+    jge .endmainloop
+    jmp .subloop3  
+
+.endsubloop3:
+    sub i0q, 12 ; undo the look-ahead, copy the rest one float at a time
+.littlesubloop3:
+    movss m0, [lq+i0q]
+    movss [dataq+j0q], m0
+
+    add i0q, 4
+    add j0q, 4
+    cmp i0q, lhq
+    jl .littlesubloop3  
+
+.endmainloop:
+    add lpq, 1
+    cmp lpq, lvq
+    jl .mainloop
+
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+HORSDFLOAT 6
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..177330ea47 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -23,12 +23,16 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
 
 void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
 void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
 
+void ff_sr_1d97_float_sse(float *line, int i0, int i1);
+void ff_hor_sd_float_sse(float *line, float *data, int mh, int lh, int lv, int w);
+
 av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,3 +52,54 @@ av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
         c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
     }
 }
+
+av_cold void ff_jpeg2000dwt_init_x86(DWTContext *s, int type)
+{
+    const int cpu_flags = av_get_cpu_flags();
+
+    /* Only the float 9/7 transform has an SSE decoder; keep the C one otherwise. */
+    if (type != FF_DWT97 || !EXTERNAL_SSE(cpu_flags))
+        return;
+    dwt_decode = dwt_decode97_float_sse;
+}
+
+/**
+ * Inverse float 9/7 DWT of the plane t (in place) using the geometry in s:
+ * rows go through ff_hor_sd_float_sse(), columns are gathered in C.
+ */
+void dwt_decode97_float_sse(DWTContext *s, float *t)
+{
+    int lev;
+    int w       = s->linelen[s->ndeclevels - 1][0];
+    float *line = s->f_linebuf;
+    float *data = t;
+    /* position at index 0 of line range [0-5,w+5] cf. extend function */
+    line += 5;
+
+    for (lev = 0; lev < s->ndeclevels; lev++) {
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        float *l;
+        // HOR_SD
+        ff_hor_sd_float_sse(line, data, mh, lh, lv, w);
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+            // copy with interleaving
+            for (i = mv; i < lv; i += 2, j++)
+                l[i] = data[w * j + lp];
+            for (i = 1 - mv; i < lv; i += 2, j++)
+                l[i] = data[w * j + lp];
+
+            ff_sr_1d97_float_sse(line, mv, mv + lv);
+
+            for (i = 0; i < lv; i++)
+                data[w * i + lp] = l[i];
+        }
+    }
+}
-- 
2.11.0



More information about the ffmpeg-devel mailing list