[FFmpeg-cvslog] lavu/tx: make C ptwo transforms in+out of place

Lynne git at videolan.org
Thu Nov 24 16:59:18 EET 2022


ffmpeg | branch: master | Lynne <dev at lynne.ee> | Thu Nov 17 20:03:09 2022 +0100| [d260796f119682274c83e2f1465f56f3e314c4a4] | committer: Lynne

lavu/tx: make C ptwo transforms in+out of place

We assume that _all_ in-place transforms can also operate out of place,
which wasn't true: the C ptwo (power-of-two) transforms always operated
in-place on dst and never read from src.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d260796f119682274c83e2f1465f56f3e314c4a4
---

 libavutil/tx_template.c | 117 +++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 56 deletions(-)

diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index f53a241248..2a8afcb02a 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -611,8 +611,8 @@ static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
     .name       = TX_NAME_STR("fft" #n "_ns"),              \
     .function   = TX_NAME(ff_tx_fft##n##_ns),               \
     .type       = TX_TYPE(FFT),                             \
-    .flags      = AV_TX_INPLACE | AV_TX_UNALIGNED |         \
-                  FF_TX_PRESHUFFLE,                         \
+    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
+                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
     .factors[0] = 2,                                        \
     .min_len    = n,                                        \
     .max_len    = n,                                        \
@@ -621,70 +621,75 @@ static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
     .prio       = FF_TX_PRIO_BASE,                          \
 };
 
-#define DECL_SR_CODELET(n, n2, n4)                                   \
-static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *dst,    \
-                                        void *src, ptrdiff_t stride) \
-{                                                                    \
-    TXComplex *z = dst;                                              \
-    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                     \
-                                                                     \
-    TX_NAME(ff_tx_fft##n2##_ns)(s, z,        z,        stride);      \
-    TX_NAME(ff_tx_fft##n4##_ns)(s, z + n4*2, z + n4*2, stride);      \
-    TX_NAME(ff_tx_fft##n4##_ns)(s, z + n4*3, z + n4*3, stride);      \
-    TX_NAME(ff_tx_fft_sr_combine)(z, cos, n4 >> 1);                  \
-}                                                                    \
-                                                                     \
+#define DECL_SR_CODELET(n, n2, n4)                                    \
+static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,    \
+                                        void *_src, ptrdiff_t stride) \
+{                                                                     \
+    TXComplex *src = _src;                                            \
+    TXComplex *dst = _dst;                                            \
+    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
+                                                                      \
+    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
+    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
+    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
+    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
+}                                                                     \
+                                                                      \
 DECL_SR_CODELET_DEF(n)
 
-static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *dst,
-                                   void *src, ptrdiff_t stride)
+static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
+                                   void *_src, ptrdiff_t stride)
 {
-    TXComplex *z = dst;
+    TXComplex *src = _src;
+    TXComplex *dst = _dst;
     TXComplex tmp;
 
-    BF(tmp.re, z[0].re, z[0].re, z[1].re);
-    BF(tmp.im, z[0].im, z[0].im, z[1].im);
-    z[1] = tmp;
+    BF(tmp.re, dst[0].re, src[0].re, src[1].re);
+    BF(tmp.im, dst[0].im, src[0].im, src[1].im);
+    dst[1] = tmp;
 }
 
-static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *dst,
-                                   void *src, ptrdiff_t stride)
+static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
+                                   void *_src, ptrdiff_t stride)
 {
-    TXComplex *z = dst;
+    TXComplex *src = _src;
+    TXComplex *dst = _dst;
     TXSample t1, t2, t3, t4, t5, t6, t7, t8;
 
-    BF(t3, t1, z[0].re, z[1].re);
-    BF(t8, t6, z[3].re, z[2].re);
-    BF(z[2].re, z[0].re, t1, t6);
-    BF(t4, t2, z[0].im, z[1].im);
-    BF(t7, t5, z[2].im, z[3].im);
-    BF(z[3].im, z[1].im, t4, t8);
-    BF(z[3].re, z[1].re, t3, t7);
-    BF(z[2].im, z[0].im, t2, t5);
+    BF(t3, t1, src[0].re, src[1].re);
+    BF(t8, t6, src[3].re, src[2].re);
+    BF(dst[2].re, dst[0].re, t1, t6);
+    BF(t4, t2, src[0].im, src[1].im);
+    BF(t7, t5, src[2].im, src[3].im);
+    BF(dst[3].im, dst[1].im, t4, t8);
+    BF(dst[3].re, dst[1].re, t3, t7);
+    BF(dst[2].im, dst[0].im, t2, t5);
 }
 
-static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *dst,
-                                   void *src, ptrdiff_t stride)
+static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
+                                   void *_src, ptrdiff_t stride)
 {
-    TXComplex *z = dst;
+    TXComplex *src = _src;
+    TXComplex *dst = _dst;
     TXSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
     const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
 
-    TX_NAME(ff_tx_fft4_ns)(s, z, z, stride);
+    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
 
-    BF(t1, z[5].re, z[4].re, -z[5].re);
-    BF(t2, z[5].im, z[4].im, -z[5].im);
-    BF(t5, z[7].re, z[6].re, -z[7].re);
-    BF(t6, z[7].im, z[6].im, -z[7].im);
+    BF(t1, dst[5].re, src[4].re, -src[5].re);
+    BF(t2, dst[5].im, src[4].im, -src[5].im);
+    BF(t5, dst[7].re, src[6].re, -src[7].re);
+    BF(t6, dst[7].im, src[6].im, -src[7].im);
 
-    BUTTERFLIES(z[0], z[2], z[4], z[6]);
-    TRANSFORM(z[1], z[3], z[5], z[7], cos, cos);
+    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
+    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
 }
 
-static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *dst,
-                                    void *src, ptrdiff_t stride)
+static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
+                                    void *_src, ptrdiff_t stride)
 {
-    TXComplex *z = dst;
+    TXComplex *src = _src;
+    TXComplex *dst = _dst;
     const TXSample *cos = TX_TAB(ff_tx_tab_16);
 
     TXSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
@@ -692,19 +697,19 @@ static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *dst,
     TXSample cos_16_2 = cos[2];
     TXSample cos_16_3 = cos[3];
 
-    TX_NAME(ff_tx_fft8_ns)(s, z +  0, z +  0, stride);
-    TX_NAME(ff_tx_fft4_ns)(s, z +  8, z +  8, stride);
-    TX_NAME(ff_tx_fft4_ns)(s, z + 12, z + 12, stride);
+    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
+    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
+    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
 
-    t1 = z[ 8].re;
-    t2 = z[ 8].im;
-    t5 = z[12].re;
-    t6 = z[12].im;
-    BUTTERFLIES(z[0], z[4], z[8], z[12]);
+    t1 = dst[ 8].re;
+    t2 = dst[ 8].im;
+    t5 = dst[12].re;
+    t6 = dst[12].im;
+    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
 
-    TRANSFORM(z[ 2], z[ 6], z[10], z[14], cos_16_2, cos_16_2);
-    TRANSFORM(z[ 1], z[ 5], z[ 9], z[13], cos_16_1, cos_16_3);
-    TRANSFORM(z[ 3], z[ 7], z[11], z[15], cos_16_3, cos_16_1);
+    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
+    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
+    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
 }
 
 DECL_SR_CODELET_DEF(2)



More information about the ffmpeg-cvslog mailing list