[FFmpeg-devel] [PATCH 07/16] hwcontext_nvtegra: add dynamic frequency scaling routines
averne
averne381 at gmail.com
Thu May 30 22:43:09 EEST 2024
To save on energy, the clock speed of multimedia engines should be adapted to their workload.
Signed-off-by: averne <averne381 at gmail.com>
---
libavutil/hwcontext_nvtegra.c | 165 ++++++++++++++++++++++++++++++++++
libavutil/hwcontext_nvtegra.h | 7 ++
2 files changed, 172 insertions(+)
diff --git a/libavutil/hwcontext_nvtegra.c b/libavutil/hwcontext_nvtegra.c
index 0f4d5a323b..6b72348082 100644
--- a/libavutil/hwcontext_nvtegra.c
+++ b/libavutil/hwcontext_nvtegra.c
@@ -46,6 +46,14 @@ typedef struct NVTegraDevicePriv {
AVNVTegraJobPool job_pool;
uint32_t vic_setup_off, vic_cmdbuf_off;
+
+ double framerate;
+ uint32_t dfs_lowcorner;
+ double dfs_decode_cycles_ema;
+ double dfs_ema_damping;
+ int dfs_bitrate_sum;
+ int dfs_cur_sample, dfs_num_samples;
+ int64_t dfs_sampling_start_ts, dfs_last_ts_delta;
} NVTegraDevicePriv;
static const enum AVPixelFormat supported_sw_formats[] = {
@@ -108,6 +116,28 @@ static inline uint32_t nvtegra_surface_get_height_align(enum AVPixelFormat fmt,
return 32;
}
+/*
+ * Request clock rate `freq` for the engine behind `channel`, then read the
+ * clock rate back into channel->clock.
+ *
+ * Returns 0 on success, or the negative error code of the first call that failed
+ * (in which case the read-back is skipped if the request itself failed).
+ */
+static int nvtegra_channel_set_freq(AVNVTegraChannel *channel, uint32_t freq) {
+    int ret;
+
+#ifdef __SWITCH__
+    /* Switch build: go through the channel's mmu request object, waiting without timeout (-1) */
+    ret = AVERROR(mmuRequestSetAndWait(&channel->mmu_request, freq, -1));
+    if (ret >= 0)
+        ret = AVERROR(mmuRequestGet(&channel->mmu_request, &channel->clock));
+#else
+    ret = av_nvtegra_channel_set_clock_rate(channel, channel->module_id, freq);
+    if (ret >= 0)
+        ret = av_nvtegra_channel_get_clock_rate(channel, channel->module_id, &channel->clock);
+#endif
+
+    /* Non-negative results are all reported as plain success */
+    return ret < 0 ? ret : 0;
+}
+
static void nvtegra_device_uninit(AVHWDeviceContext *ctx) {
NVTegraDevicePriv *priv = ctx->hwctx;
AVNVTegraDeviceContext *hwctx = &priv->p;
@@ -386,6 +416,141 @@ static int nvtegra_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) {
return 0;
}
+/*
+ * Possible frequencies on Icosa and Mariko+, in MHz
+ * (see tegra210-core-dvfs.c and tegra210b01-core-dvfs.c in l4t kernel sources, respectively).
+ * For each engine, the first row lists the Icosa values and the second row the Mariko+ values:
+ * for NVDEC:
+ * 268.8, 384.0, 448.0, 486.4, 550.4, 576.0, 614.4, 652.8, 678.4, 691.2, 716.8
+ * 460.8, 499.2, 556.8, 633.6, 652.8, 710.4, 748.8, 787.2, 825.6, 844.8, 883.2, 902.4, 921.6, 940.8, 960.0, 979.2
+ * for NVJPG:
+ * 192.0, 307.2, 345.6, 409.6, 486.4, 524.8, 550.4, 576.0, 588.8, 614.4, 627.2
+ * 422.4, 441.6, 499.2, 518.4, 537.6, 556.8, 576.0, 595.2, 614.4, 633.6, 652.8
+ */
+
+/*
+ * Initialize dynamic frequency scaling (DFS) for a channel.
+ *
+ * Derives a low-corner (minimum) frequency from the frame size in macroblocks,
+ * scales it down for framerates below 30 Hz, stores the DFS parameters in the
+ * device private context and sets the channel to the highest frequency.
+ *
+ * ctx           device context whose private data holds the DFS state
+ * channel       channel to reclock
+ * width, height frame dimensions in pixels
+ *               NOTE(review): presumably the coded dimensions - confirm with callers
+ * framerate_hz  stream framerate; not used for the low corner when it is
+ *               below 0.1 or not finite
+ *
+ * Returns 0 on success, or the negative error code from nvtegra_channel_set_freq().
+ * On failure the sampling state (EMA, bitrate sum, timestamps) is left untouched.
+ */
+int av_nvtegra_dfs_init(AVHWDeviceContext *ctx, AVNVTegraChannel *channel, int width, int height,
+                        double framerate_hz)
+{
+    NVTegraDevicePriv *priv = ctx->hwctx;
+
+    uint32_t max_freq, lowcorner;
+    int num_mbs, err;
+
+    priv->dfs_num_samples = 20;
+    priv->dfs_ema_damping = 0.1;
+
+    /*
+     * Initialize low-corner frequency (reproduces official code)
+     * Framerate might be unavailable (or variable), but this is official logic
+     */
+    num_mbs = width / 16 * height / 16;
+    if (num_mbs <= 3600)
+        lowcorner = 100000000; /* up to 3600 MBs, e.g. 1280x720 */
+    else if (num_mbs <= 8160)
+        lowcorner = 180000000; /* up to 8160 MBs, e.g. 1920x1088 */
+    else if (num_mbs <= 32400)
+        lowcorner = 345000000; /* up to 32400 MBs, e.g. 3840x2160 */
+    else
+        lowcorner = 576000000; /* anything larger */
+
+    /* Only ever lowers the corner: framerates of 30 Hz and above keep the table value */
+    if (framerate_hz >= 0.1 && isfinite(framerate_hz))
+        lowcorner = FFMIN(lowcorner, lowcorner * framerate_hz / 30.0);
+
+    priv->framerate     = framerate_hz;
+    priv->dfs_lowcorner = lowcorner;
+
+    /* dfs_lowcorner is uint32_t and dfs_num_samples is int: match the specifiers to the types */
+    av_log(ctx, AV_LOG_DEBUG, "DFS: Initializing lowcorner to %u Hz, using %d samples\n",
+           priv->dfs_lowcorner, priv->dfs_num_samples);
+
+    /*
+     * Initialize channel to the max possible frequency (the kernel driver will clamp to an allowed value)
+     * Note: Official code passes INT_MAX kHz then multiplies by 1000 (to Hz) and converts to u32,
+     * resulting in this value.
+     */
+    max_freq = ((UINT64_C(1) << 32) - 1000) & UINT32_MAX;
+
+    err = nvtegra_channel_set_freq(channel, max_freq);
+    if (err < 0)
+        return err;
+
+    /* Start a fresh sampling window */
+    priv->dfs_decode_cycles_ema = 0.0;
+    priv->dfs_bitrate_sum       = 0;
+    priv->dfs_cur_sample        = 0;
+    priv->dfs_sampling_start_ts = av_gettime_relative();
+    priv->dfs_last_ts_delta     = 0;
+
+    return 0;
+}
+
+/*
+ * Feed the statistics of one decoded frame to the DFS logic and, once every
+ * dfs_num_samples calls, reclock the channel.
+ *
+ * ctx            device context whose private data holds the DFS state
+ * channel        channel to reclock
+ * bitstream_len  size of the bitstream fed to the hardware, in bytes
+ *                (converted to bits internally); values <= 0 still count as a
+ *                sample but contribute neither to the EMA nor to the bitrate sum
+ * decode_cycles  decode cycle count for that bitstream
+ *
+ * Returns 0 when no reclock was attempted (including when DFS was never
+ * initialized), otherwise the result of nvtegra_channel_set_freq().
+ */
+int av_nvtegra_dfs_update(AVHWDeviceContext *ctx, AVNVTegraChannel *channel, int bitstream_len, int decode_cycles) {
+    NVTegraDevicePriv *priv = ctx->hwctx;
+
+    double frame_time, avg, target;
+    int64_t now, wl_dt;
+    uint32_t clock;
+    int err;
+
+    /*
+     * Official software implements DFS using a flat average of the decoder pool occupancy.
+     * We instead use the decode cycles as reported by NVDEC microcode, and the "bitrate"
+     * (bitstream bits fed to the hardware in a given clock time interval, NOT video time),
+     * to calculate a suitable frequency, and multiply it by 1.2 for good measure:
+     *   Freq = decode_cycles_per_bit * bits_per_second * 1.2
+     */
+
+    /* No sampling window configured: the modulo below would divide by zero */
+    if (priv->dfs_num_samples <= 0)
+        return 0;
+
+    /* An empty bitstream would divide by zero and leave the EMA at inf/NaN for good */
+    if (bitstream_len > 0) {
+        /* Convert to bits */
+        bitstream_len *= 8;
+
+        /* Exponential moving average of decode cycles per bit */
+        priv->dfs_decode_cycles_ema = priv->dfs_ema_damping * (double)decode_cycles/bitstream_len +
+            (1.0 - priv->dfs_ema_damping) * priv->dfs_decode_cycles_ema;
+
+        priv->dfs_bitrate_sum += bitstream_len;
+    }
+
+    priv->dfs_cur_sample = (priv->dfs_cur_sample + 1) % priv->dfs_num_samples;
+
+    err = 0;
+
+    /* Reclock if we collected enough samples */
+    if (priv->dfs_cur_sample == 0) {
+        now   = av_gettime_relative();
+        wl_dt = now - priv->dfs_sampling_start_ts;
+
+        /*
+         * Try to filter bad sample sets caused by eg. pausing the video playback.
+         * A sample set is used when at least one of these holds:
+         * - the wall time is under 1.5x the expected duration of the sample set
+         *   (10Hz is used as fallback if no framerate information is available)
+         * - the wall time is under 1.5x the wall time of the last accepted sample set
+         * i.e. it is rejected only when both limits are exceeded.
+         * NOTE(review): the original comment said a set is rejected when either limit
+         * is exceeded, which would need && here - confirm which one is intended.
+         */
+
+        if (priv->framerate >= 0.1 && isfinite(priv->framerate))
+            frame_time = 1.0e6 / priv->framerate;
+        else
+            frame_time = 0.1e6;
+
+        /* wl_dt > 0 keeps the bitrate division below well-defined */
+        if ((wl_dt > 0) &&
+            ((wl_dt < 1.5 * priv->dfs_num_samples * frame_time) ||
+             ((priv->dfs_last_ts_delta) && (wl_dt < 1.5 * priv->dfs_last_ts_delta)))) {
+            avg    = priv->dfs_bitrate_sum * 1e6 / wl_dt;
+            target = priv->dfs_decode_cycles_ema * avg * 1.2;
+
+            /*
+             * Clamp in floating point before converting: converting an out-of-range
+             * (or NaN) double to uint32_t is undefined behaviour.
+             */
+            if (!(target > priv->dfs_lowcorner))
+                clock = priv->dfs_lowcorner;
+            else if (target >= (double)UINT32_MAX)
+                clock = UINT32_MAX;
+            else
+                clock = (uint32_t)target;
+
+            av_log(ctx, AV_LOG_DEBUG, "DFS: %.0f cycles/b (ema), %.0f b/s -> clock %u Hz (lowcorner %u Hz)\n",
+                   priv->dfs_decode_cycles_ema, avg, clock, priv->dfs_lowcorner);
+
+            err = nvtegra_channel_set_freq(channel, clock);
+
+            priv->dfs_last_ts_delta = wl_dt;
+        }
+
+        /* Start the next sampling window whether or not this one was used */
+        priv->dfs_bitrate_sum       = 0;
+        priv->dfs_sampling_start_ts = now;
+    }
+
+    return err;
+}
+
+/*
+ * Shut down DFS for a channel by requesting a clock rate of 0.
+ * ctx is not used by this function.
+ * NOTE(review): presumably a rate of 0 releases the clock request - confirm against the driver.
+ *
+ * Returns the result of nvtegra_channel_set_freq().
+ */
+int av_nvtegra_dfs_uninit(AVHWDeviceContext *ctx, AVNVTegraChannel *channel) {
+    const uint32_t freq = 0;
+
+    return nvtegra_channel_set_freq(channel, freq);
+}
+
static int nvtegra_transfer_get_formats(AVHWFramesContext *ctx,
enum AVHWFrameTransferDirection dir,
enum AVPixelFormat **formats)
diff --git a/libavutil/hwcontext_nvtegra.h b/libavutil/hwcontext_nvtegra.h
index 8a2383d304..7c845951d9 100644
--- a/libavutil/hwcontext_nvtegra.h
+++ b/libavutil/hwcontext_nvtegra.h
@@ -82,4 +82,11 @@ static inline AVNVTegraMap *av_nvtegra_frame_get_fbuf_map(const AVFrame *frame)
*/
int av_nvtegra_pixfmt_to_vic(enum AVPixelFormat fmt);
+/*
+ * Dynamic frequency scaling routines
+ *
+ * av_nvtegra_dfs_init():   picks a low-corner frequency from the frame size
+ *                          (and from framerate_hz when it is >= 0.1 and finite),
+ *                          sets the channel to the highest frequency and resets
+ *                          the sampling state.
+ * av_nvtegra_dfs_update(): to be called with the bitstream length (in bytes) and
+ *                          decode cycle count of each frame; every 20 calls it may
+ *                          reclock the channel based on the collected samples.
+ * av_nvtegra_dfs_uninit(): requests a clock rate of 0 for the channel.
+ *
+ * All three return 0 on success or a negative error code from the clock request.
+ */
+int av_nvtegra_dfs_init(AVHWDeviceContext *ctx, AVNVTegraChannel *channel, int width, int height, double framerate_hz);
+int av_nvtegra_dfs_update(AVHWDeviceContext *ctx, AVNVTegraChannel *channel, int bitstream_len, int decode_cycles);
+int av_nvtegra_dfs_uninit(AVHWDeviceContext *ctx, AVNVTegraChannel *channel);
+
#endif /* AVUTIL_HWCONTEXT_NVTEGRA_H */
--
2.45.1
More information about the ffmpeg-devel
mailing list