[FFmpeg-cvslog] avfilter/avf_showcqt: cqt_calc optimization on x86
Muhammad Faiz
git at videolan.org
Wed Jun 8 11:15:29 CEST 2016
ffmpeg | branch: master | Muhammad Faiz <mfcc64 at gmail.com> | Sat Jun 4 14:33:05 2016 +0700| [1e69ac9246be8c9a1bf595e7fe949df5bc541c55] | committer: Muhammad Faiz
avfilter/avf_showcqt: cqt_calc optimization on x86
on x86_64:
time PSNR
plain 3.303 inf
SSE 1.649 107.087535
SSE3 1.632 107.087535
AVX 1.409 106.986771
FMA3 1.265 107.108437
on x86_32 (PSNR compared to x86_64 plain):
time PSNR
plain 7.225 103.951979
SSE 1.827 105.859282
SSE3 1.819 105.859282
AVX 1.533 105.997661
FMA3 1.384 105.885377
FMA4 test is not available
Reviewed-by: James Almer <jamrial at gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1e69ac9246be8c9a1bf595e7fe949df5bc541c55
---
libavfilter/avf_showcqt.c | 7 ++
libavfilter/avf_showcqt.h | 3 +
libavfilter/x86/Makefile | 2 +
libavfilter/x86/avf_showcqt.asm | 206 ++++++++++++++++++++++++++++++++++++
libavfilter/x86/avf_showcqt_init.c | 63 +++++++++++
5 files changed, 281 insertions(+)
diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index b88c83c..62d5b09 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
w *= sign * (1.0 / s->fft_len);
s->coeffs[m].val[x - s->coeffs[m].start] = w;
}
+
+ if (s->permute_coeffs)
+ s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
}
av_expr_free(expr);
@@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
s->cqt_align = 1;
s->cqt_calc = cqt_calc;
+ s->permute_coeffs = NULL;
s->draw_sono = draw_sono;
if (s->format == AV_PIX_FMT_RGB24) {
s->draw_bar = draw_bar_rgb;
@@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
s->update_sono = update_sono_yuv;
}
+ if (ARCH_X86)
+ ff_showcqt_init_x86(s);
+
if ((ret = init_cqt(s)) < 0)
return ret;
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index b945f49..588830f 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -74,6 +74,7 @@ typedef struct {
/* callback */
void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
int len, int fft_len);
+ void (*permute_coeffs)(float *v, int len);
void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
const ColorFloat *c, int bar_h);
void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
@@ -112,4 +113,6 @@ typedef struct {
int axis;
} ShowCQTContext;
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
#endif
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 4486b79..b6195f8 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
@@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
ifdef CONFIG_GPL
YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
endif
+YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
new file mode 100644
index 0000000..6dac0a7
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt.asm
@@ -0,0 +1,206 @@
+;*****************************************************************************
+;* x86-optimized functions for showcqt filter
+;*
+;* Copyright (C) 2016 Muhammad Faiz <mfcc64 at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 ; Coeffs.val holds an address: reserve a qword for it on x86-64
+%define pointer resq
+%else ; x86-32: an address fits in a dword
+%define pointer resd
+%endif
+
+struc Coeffs ; field offsets of one coeffs array entry; NOTE(review): must match the C Coeffs layout, which is not visible here
+ .val: pointer 1 ; address of the coefficient values (indexed with a 4-byte stride below)
+ .start: resd 1 ; 32-bit value added to x to form the src index
+ .len: resd 1 ; 32-bit count of coefficient values to process
+ .sizeof: ; size of one entry, used to reach the next entry of the array
+endstruc
+
+%macro EMULATE_HADDPS 3 ; dst, src, tmp -- horizontal add of adjacent pairs, with a fallback when SSE3 is not enabled
+%if cpuflag(sse3)
+ haddps %1, %2 ; native instruction; tmp is left untouched
+%else
+ movaps %3, %1 ; keep a copy of dst (tmp is clobbered on this path)
+ shufps %1, %2, q2020 ; dst = even-indexed floats of dst, then of src
+ shufps %3, %2, q3131 ; tmp = odd-indexed floats of dst, then of src
+ addps %1, %3 ; even + odd = sums of adjacent pairs
+%endif
+%endmacro ; EMULATE_HADDPS
+
+%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp -- dst = src1 * src2 + src3
+%if cpuflag(fma3) || cpuflag(fma4)
+ fmaddps %1, %2, %3, %4 ; fused multiply-add; tmp is not used on this path
+%else
+ mulps %5, %2, %3 ; tmp = src1 * src2 (callers in this file pass the same register as src1 and tmp)
+ addps %1, %4, %5 ; dst = src3 + tmp
+%endif
+%endmacro ; EMULATE_FMADDPS
+
+%macro CQT_CALC 9 ; multiply one vector of coefficients with src[i...] and with the mirrored src[fft_len - i...], accumulating into a and b
+; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
+; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
+ mov id, xd
+ add id, [coeffsq + Coeffs.start + %9] ; i = x + start of the selected coeffs entry
+ movaps m%5, [srcq + 8 * iq] ; aligned load from src[i]; each element is 8 bytes (re, im)
+ movaps m%7, [srcq + 8 * iq + mmsize] ; the following vector of src
+ shufps m%6, m%5, m%7, q3131 ; odd-indexed floats: imaginary parts
+ shufps m%5, m%5, m%7, q2020 ; even-indexed floats: real parts
+ sub id, fft_lend ; first half of i = fft_len - i, interleaved with the arithmetic
+ EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 ; a_im += im * coeffval
+ neg id ; i = fft_len - i
+ EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 ; a_re += re * coeffval
+ movups m%5, [srcq + 8 * iq - mmsize + 8] ; unaligned load whose last element is src[fft_len - i]
+ movups m%7, [srcq + 8 * iq - 2*mmsize + 8] ; the vector just below it
+ %if mmsize == 32
+ vperm2f128 m%5, m%5, m%5, 1 ; swap the two 128-bit halves so the highest elements come first
+ vperm2f128 m%7, m%7, m%7, 1
+ %endif
+ shufps m%6, m%5, m%7, q1313 ; imaginary parts, element order reversed within each 128-bit lane
+ shufps m%5, m%5, m%7, q0202 ; real parts, element order reversed within each 128-bit lane
+ EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 ; b_im += im * coeffval
+ EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 ; b_re += re * coeffval
+%endmacro ; CQT_CALC
+
+%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2 -- combine the a/b accumulators and reduce them to four sums in the xmm register of a_re
+ addps m%5, m%4, m%2 ; tmp = b_im + a_im
+ subps m%6, m%3, m%1 ; tmp2 = b_re - a_re
+ addps m%1, m%3 ; a_re += b_re
+ subps m%2, m%4 ; a_im -= b_im
+ EMULATE_HADDPS m%5, m%6, m%3 ; pairwise sums of tmp and tmp2; b_re is no longer needed and serves as scratch
+ EMULATE_HADDPS m%1, m%2, m%3 ; pairwise sums of a_re and a_im
+ EMULATE_HADDPS m%1, m%5, m%2 ; per 128-bit lane: sum a_re, sum a_im, sum tmp, sum tmp2
+ %if mmsize == 32
+ vextractf128 xmm%2, m%1, 1 ; take the upper 128-bit half ...
+ addps xmm%1, xmm%2 ; ... and add it to the lower half
+ %endif
+%endmacro ; CQT_SEPARATE
+
+%macro DECLARE_CQT_CALC 0 ; emits showcqt_cqt_calc for the instruction set chosen by the preceding INIT_XMM/INIT_YMM
+; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
+%if ARCH_X86_64 ; x86-64: two coeffs entries (a = first, b = second) are handled per .loop_k iteration
+cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
+ align 16
+ .loop_k:
+ mov xd, [coeffsq + Coeffs.len] ; x = a.len (used only for the min computation)
+ xorps m0, m0 ; m0-m3: accumulators of a, cleared
+ movaps m1, m0
+ movaps m2, m0
+ mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] ; coeffs_len = b.len
+ movaps m3, m0
+ movaps m8, m0 ; m8-m11: accumulators of b, cleared
+ cmp coeffs_lend, xd ; flags survive the movaps below until cmova
+ movaps m9, m0
+ movaps m10, m0
+ movaps m11, m0
+ cmova coeffs_lend, xd ; coeffs_len = unsigned min(a.len, b.len)
+ xor xd, xd ; x = 0
+ mov coeffs_valq, [coeffsq + Coeffs.val] ; load both value pointers before the zero-length shortcut:
+ mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] ; .loop_a / .loop_b still run when only one length is zero
+ test coeffs_lend, coeffs_lend
+ jz .check_loop_b ; nothing common to both entries
+ align 16
+ .loop_ab: ; x in [0, min): process a and b together
+ movaps m7, [coeffs_valq + 4 * xq] ; one vector of a's values (4 bytes each)
+ CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+ movaps m7, [coeffs_val2q + 4 * xq] ; one vector of b's values
+ CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+ add xd, mmsize/4 ; advance by the number of floats per vector
+ cmp xd, coeffs_lend
+ jb .loop_ab
+ .check_loop_b:
+ cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+ jae .check_loop_a ; b is done: see whether a has values left
+ align 16
+ .loop_b: ; remaining values of b only
+ movaps m7, [coeffs_val2q + 4 * xq]
+ CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
+ add xd, mmsize/4
+ cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
+ jb .loop_b
+ .loop_end:
+ CQT_SEPARATE 0, 1, 2, 3, 4, 5 ; xmm0 = four sums for a
+ CQT_SEPARATE 8, 9, 10, 11, 4, 5 ; xmm8 = four sums for b
+ mulps xmm0, xmm0 ; square each sum
+ mulps xmm8, xmm8
+ EMULATE_HADDPS xmm0, xmm8, xmm1 ; add the squares pairwise: two results for a, then two for b
+ movaps [dstq], xmm0 ; aligned 16-byte store: two 8-byte dst elements
+ sub lend, 2 ; two entries consumed; the lea below leave the flags intact for jnz
+ lea dstq, [dstq + 16]
+ lea coeffsq, [coeffsq + 2*Coeffs.sizeof]
+ jnz .loop_k
+ REP_RET
+ align 16
+ .check_loop_a: ; reached only when b has no values left
+ cmp xd, [coeffsq + Coeffs.len]
+ jae .loop_end
+ align 16
+ .loop_a: ; remaining values of a only
+ movaps m7, [coeffs_valq + 4 * xq]
+ CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+ add xd, mmsize/4
+ cmp xd, [coeffsq + Coeffs.len]
+ jb .loop_a
+ jmp .loop_end
+%else ; x86-32: one coeffs entry per .loop_k iteration
+cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i ; fft_len is not given a register name here
+%define fft_lend r4m
+ align 16
+ .loop_k:
+ mov xd, [coeffsq + Coeffs.len]
+ xorps m0, m0 ; m0-m3: accumulators, cleared
+ movaps m1, m0
+ movaps m2, m0
+ movaps m3, m0
+ test xd, xd
+ jz .store ; empty entry: store the zeroed xmm0
+ mov coeffs_valq, [coeffsq + Coeffs.val]
+ xor xd, xd ; x = 0
+ align 16
+ .loop_x:
+ movaps m7, [coeffs_valq + 4 * xq] ; one vector of coefficient values (4 bytes each)
+ CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
+ add xd, mmsize/4 ; advance by the number of floats per vector
+ cmp xd, [coeffsq + Coeffs.len]
+ jb .loop_x
+ CQT_SEPARATE 0, 1, 2, 3, 4, 5 ; xmm0 = four sums
+ mulps xmm0, xmm0 ; square each sum
+ EMULATE_HADDPS xmm0, xmm0, xmm1 ; add the squares pairwise; the low two floats are the result
+ .store:
+ movlps [dstq], xmm0 ; store the low 8 bytes: one dst element
+ sub lend, 1 ; the lea below leave the flags intact for jnz
+ lea dstq, [dstq + 8]
+ lea coeffsq, [coeffsq + Coeffs.sizeof]
+ jnz .loop_k
+ REP_RET
+%endif ; ARCH_X86_64
+%endmacro ; DECLARE_CQT_CALC
+
+INIT_XMM sse ; 128-bit registers; neither the sse3 nor the fma cpuflag checks in the macros above hold
+DECLARE_CQT_CALC
+INIT_XMM sse3 ; 128-bit registers; EMULATE_HADDPS uses the native haddps
+DECLARE_CQT_CALC
+INIT_YMM avx ; 256-bit registers (mmsize == 32 paths are assembled)
+DECLARE_CQT_CALC
+INIT_YMM fma3 ; 256-bit registers; EMULATE_FMADDPS uses fmaddps
+DECLARE_CQT_CALC
+INIT_XMM fma4 ; 128-bit registers; EMULATE_FMADDPS uses fmaddps
+DECLARE_CQT_CALC
diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
new file mode 100644
index 0000000..0cc164c
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt_init.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/avf_showcqt.h"
+
+#define DECLARE_CQT_CALC(type) /* prototype of ff_showcqt_cqt_calc_<type> */ \
+void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
+ const Coeffs *coeffs, int len, int fft_len)
+
+DECLARE_CQT_CALC(sse); /* one prototype per variant assembled in x86/avf_showcqt.asm */
+DECLARE_CQT_CALC(sse3);
+DECLARE_CQT_CALC(avx);
+DECLARE_CQT_CALC(fma3);
+DECLARE_CQT_CALC(fma4);
+
+#define permute_coeffs_0 NULL /* "no permutation" choice for the perm argument of SELECT_CQT_CALC */
+
+static void permute_coeffs_01452367(float *v, int len)
+{
+    int base = 0; /* first index of the current group of 8 floats */
+    for (; base < len; base += 8) { /* in place: 0 1 2 3 4 5 6 7 becomes 0 1 4 5 2 3 6 7 */
+        FFSWAP(float, v[base + 3], v[base + 5]);
+        FFSWAP(float, v[base + 2], v[base + 4]);
+    }
+}
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    /* Every entry whose CPU check passes overwrites the callbacks, so the last match wins. */
+#define SELECT_CQT_CALC(suffix, FLAG, alignment, order) do { \
+    if (EXTERNAL_##FLAG(flags)) { \
+        s->permute_coeffs = permute_coeffs_##order; \
+        s->cqt_align      = alignment; \
+        s->cqt_calc       = ff_showcqt_cqt_calc_##suffix; \
+    } } while (0)
+
+    SELECT_CQT_CALC(sse,  SSE,       4, 0);
+    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
+    SELECT_CQT_CALC(fma4, FMA4,      4, 0); // using xmm
+    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
+    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
+}
More information about the ffmpeg-cvslog
mailing list