[Libav-user] Why does sws_scale runtime depend on calling frequency?

lukas lukas.bommes at gmx.de
Thu May 30 12:53:18 EEST 2019


Dear libav-users,

I wrote a program that extracts motion vectors from a video stream and 
encountered the problem that the runtime of sws_scale changes depending 
on whether I put a sleep command in the main loop of the caller. Without 
the sleep, sws_scale returns after about 0.9 ms on my machine. With a 
sleep of any length (I tested 1 ms, 25 ms, 500 ms and 1 s), the runtime 
rises to around 7 ms.
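
For reference, here is a condensed sketch of what the measurement boils 
down to (the full code is attached below). decode_next_frame is a 
hypothetical stand-in for the avcodec_send_packet/avcodec_receive_frame 
path in the attached class, and img_convert_ctx, frame, rgb_frame and 
video_dec_ctx are set up as in the attachment; commenting the sleep_for 
line in or out switches between the two timings:

     while (decode_next_frame(frame)) {
         auto start = std::chrono::high_resolution_clock::now();
         sws_scale(img_convert_ctx, frame->data, frame->linesize,
                   0, video_dec_ctx->coded_height,
                   rgb_frame.data, rgb_frame.linesize);
         auto finish = std::chrono::high_resolution_clock::now();
         std::cout << std::chrono::duration<double>(finish - start).count() << " s\n";
         std::this_thread::sleep_for(std::chrono::milliseconds(25)); // remove -> ~0.9 ms, keep -> ~7 ms
     }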

I am using libswscale as shipped with FFmpeg 4.1, and my implementation 
is similar to the code used in OpenCV's VideoCapture 
(https://github.com/opencv/opencv/blob/master/modules/videoio/src/cap_ffmpeg_impl.hpp#L431).

I would be glad if someone could provide me with at least an idea of 
what is going wrong here. My code is attached below.

Best regards,

Lukas


// Compile command:
// g++ -I ~/boost -I /usr/include/python3.6m/ -fpic video_cap.cpp -o main \
//     -L ~/boost/stage/lib -lboost_python36 -lboost_numpy36 -lpython3.6m \
//     `pkg-config --cflags --libs libavformat libswscale opencv4` -Wl,-Bsymbolic

#include <thread>
#include <iostream>
#include <vector>
#include <chrono>


#include <opencv2/opencv.hpp>
#include <opencv2/core/types.hpp>
#include <opencv2/imgproc.hpp>

// FFMPEG
extern "C" {
#include <libavutil/motion_vector.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
}

// for changing the dtype of motion vector
#define MVS_DTYPE int32_t
#define MVS_DTYPE_NP NPY_INT32


struct Image_FFMPEG
{
     unsigned char* data;
     int step;
     int width;
     int height;
     int cn;
};


class VideoCap {

private:

     const char *url;
     AVDictionary *opts;
     AVCodec *codec;
     AVFormatContext *fmt_ctx;
     AVCodecContext *video_dec_ctx;
     AVStream *video_stream;
     int video_stream_idx;
     AVFrame *frame;
     AVFrame rgb_frame;
     Image_FFMPEG picture;
     struct SwsContext *img_convert_ctx;


public:

     VideoCap() {
         this->opts = NULL;
         this->codec = NULL;
         this->fmt_ctx = NULL;
         this->video_dec_ctx = NULL;
         this->video_stream = NULL;
         this->video_stream_idx = -1;
         this->frame = NULL;
         this->img_convert_ctx = NULL;

         memset(&(this->rgb_frame), 0, sizeof(this->rgb_frame));
         memset(&(this->picture), 0, sizeof(this->picture));
     }


     void free_all() {
         if (this->img_convert_ctx) {
             sws_freeContext(this->img_convert_ctx);
             this->img_convert_ctx = 0;
         }

         if (this->frame)
             av_frame_free(&(this->frame));

         av_frame_unref(&(this->rgb_frame));

         if (this->video_dec_ctx)
             avcodec_free_context(&(this->video_dec_ctx));

         if (this->fmt_ctx)
             avformat_close_input(&(this->fmt_ctx));
     }


     void release(void) {
         this->free_all();
     }


     bool open(const char *url) {
         this->url = url;
         int ret;

         // open RTSP stream with TCP
         av_dict_set(&(this->opts), "rtsp_transport", "tcp", 0);
         ret = avformat_open_input(&(this->fmt_ctx), url, NULL, &(this->opts));
         if (ret < 0) {
             std::cerr << "Could not open source file ' " << url << "'" 
<< std::endl;
             return false;
         }

         // read packets of a media file to get stream information.
         ret = avformat_find_stream_info(this->fmt_ctx, NULL);
         if (ret < 0) {
             std::cerr << "Could not find stream information" << std::endl;
             return false;
         }

         ret = this->open_codec_context(this->fmt_ctx, AVMEDIA_TYPE_VIDEO);
         if (!ret) {
             std::cerr << "Could not create codex context" << std::endl;
             return false;
         }

         // print info (duration, bitrate, streams, container, programs, metadata, side data, codec, time base)
         av_dump_format(this->fmt_ctx, 0, url, 0);

         if (!this->video_stream) {
             std::cerr << "Could not find video stream in the input, 
aborting" << std::endl;
             this->free_all();
             return false;
         }

         this->frame = av_frame_alloc();
         if (!this->frame) {
             std::cerr << "Could not allocate frame" << std::endl;
             this->free_all();
             return false;
         }

         return true;
     }


     bool open_codec_context(AVFormatContext *fmt_ctx, enum AVMediaType type) {
         // find the most suitable stream of given type (e.g. video) and set the codec accordingly
         int ret = av_find_best_stream(fmt_ctx, type, -1, -1, &(this->codec), 0);
         if (ret < 0) {
             std::cerr << "Could not find " << 
av_get_media_type_string(type) << " stream in input file '" << this->url 
<< "'" << std::endl;
             return false;
         }
         else {
             // set stream in format context
             this->video_stream_idx = ret;
             AVStream *st = fmt_ctx->streams[this->video_stream_idx];

             // allocate an AVCodecContext and set its fields to default values
             this->video_dec_ctx = avcodec_alloc_context3(this->codec);
             if (!this->video_dec_ctx) {
                 std::cerr << "Failed to allocate codec" << std::endl;
                 return false;
             }

             // fill the codec context based on the values from the supplied codec parameters
             ret = avcodec_parameters_to_context(this->video_dec_ctx, st->codecpar);
             if (ret < 0) {
                 std::cerr << "Failed to copy codec parameters to codec 
context" << std::endl;
                 return false;
             }

             this->video_dec_ctx->thread_count = std::thread::hardware_concurrency();
             std::cerr << "Using parallel processing with " << this->video_dec_ctx->thread_count
                       << " threads" << std::endl;

             // backup encoder's width/height
             int enc_width = this->video_dec_ctx->width;
             int enc_height = this->video_dec_ctx->height;

             // init the video decoder with the codec and set additional option to extract motion vectors
             av_dict_set(&(this->opts), "flags2", "+export_mvs", 0);
             ret = avcodec_open2(this->video_dec_ctx, this->codec, &(this->opts));
             if (ret < 0) {
                 std::cerr << "Failed to open " << 
av_get_media_type_string(type) << " codec" << std::endl;
                 return false;
             }

             this->video_stream = fmt_ctx->streams[this->video_stream_idx];

             // check width/height (since the decoder can sometimes alter it, e.g. vp6f)
             if (enc_width && (this->video_dec_ctx->width != enc_width)) {
                 this->video_dec_ctx->width = enc_width;
             }
             if (enc_height && (this->video_dec_ctx->height != enc_height)) {
                 this->video_dec_ctx->height = enc_height;
             }

             this->picture.width = this->video_dec_ctx->width;
             this->picture.height = this->video_dec_ctx->height;
             this->picture.cn = 3;
             this->picture.step = 0;
             this->picture.data = NULL;
         }

         return true;
     }


     bool read(cv::OutputArray cv_frame, char *frame_type,
               MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {

         uint8_t* data = 0;
         int step = 0, width = 0, height = 0, cn = 0;

         // loop over different streams (video, audio) in the file
         while(1) {
             AVPacket pkt = { 0 };

             // read next packet from the stream
             int ret = av_read_frame(this->fmt_ctx, &pkt);
             if (ret < 0) {
                 return false;
             }

             // if the packet is not from the video stream, drop it and get the next packet
             if (pkt.stream_index != this->video_stream_idx) {
                 av_packet_unref(&pkt);  // release the packet to avoid leaking its buffers
                 continue;
             }
             // if the packet is from the video stream send it to decoder
             else {

                 bool ret = this->decode_packet(&pkt, &data, &step, &width, &height,
                                                &cn, frame_type, motion_vectors, num_mvs);
                 if (!ret) {
                     av_packet_unref(&pkt);  // release the packet also on the error path
                     return false;
                 }

                 cv::Mat(height, width, CV_MAKETYPE(CV_8U, cn), data, step).copyTo(cv_frame);

                 av_packet_unref(&pkt);

                 return true;
             }
         }
     }


     bool frame_to_buffer(uint8_t** data, int* step, int* width, int* height, int* cn)
     {
         if (!this->video_stream || !(this->frame->data[0])) {
             return false;
         }

         if (this->img_convert_ctx == NULL ||
             this->picture.width != this->video_dec_ctx->width ||
             this->picture.height != this->video_dec_ctx->height ||
             this->picture.data == NULL) {

             // Some sws_scale optimizations make assumptions about the alignment of data/step/width/height.
             // We also use coded_width/height to work around a problem with legacy ffmpeg versions (like n0.8).
             int buffer_width = this->video_dec_ctx->coded_width;
             int buffer_height = this->video_dec_ctx->coded_height;

             this->img_convert_ctx = sws_getCachedContext(
                     this->img_convert_ctx,
                     buffer_width, buffer_height,
                     this->video_dec_ctx->pix_fmt,
                     buffer_width, buffer_height,
                     AV_PIX_FMT_BGR24,
                     SWS_BICUBIC,
                     NULL, NULL, NULL
                     );

             if (this->img_convert_ctx == NULL) {
                 std::cerr << "Allocation of conversion context failed" 
<< std::endl;
                 return false;
             }

             av_frame_unref(&(this->rgb_frame));
             this->rgb_frame.format = AV_PIX_FMT_BGR24;
             this->rgb_frame.width = buffer_width;
             this->rgb_frame.height = buffer_height;
             if (0 != av_frame_get_buffer(&(this->rgb_frame), 32)) {
                 std::cerr << "Not enough memory to allocate buffer for 
frame conversion" << std::endl;
                 return false;
             }

             this->picture.width = this->video_dec_ctx->width;
             this->picture.height = this->video_dec_ctx->height;
             this->picture.cn = 3;
             this->picture.data = this->rgb_frame.data[0];
             this->picture.step = this->rgb_frame.linesize[0];
         }

         auto start = std::chrono::high_resolution_clock::now();
         sws_scale(
             this->img_convert_ctx,
             this->frame->data,
             this->frame->linesize,
             0, this->video_dec_ctx->coded_height,
             this->rgb_frame.data,
             this->rgb_frame.linesize
             );
         auto finish = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double> elapsed = finish - start;
         std::cout << "sws_scale (C++): " << elapsed.count() << " s\n";

         *data = this->picture.data;
         *step = this->picture.step;
         *width = this->picture.width;
         *height = this->picture.height;
         *cn = this->picture.cn;

         return true;
     }


     bool decode_packet(const AVPacket *pkt, uint8_t** data, int* step, int* width,
                        int* height, int* cn, char *frame_type,
                        MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {
         // send encoded data packet to the decoder
         int ret = avcodec_send_packet(this->video_dec_ctx, pkt);
         if (ret < 0) {
             std::cerr << "Error while sending a packet to the decoder: 
" << ret << std::endl;
             return false;
         }

         // receive frames from the decoder until no more are available for this packet
         while (ret >= 0)  {
             // try to get the next frame from decoder
             ret = avcodec_receive_frame(this->video_dec_ctx, this->frame);
             // failed: end of stream or no frame available, stop and return with success
             if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                 break;
             }
             // failed: another error occurred, return with error
             else if (ret < 0) {
                 std::cerr << "Error while receiving a frame from the decoder: " << ret << std::endl;
                 return false;
             }
             // successfully decoded a new frame, get motion vectors
             else {

                 AVFrameSideData *sd = av_frame_get_side_data(this->frame, AV_FRAME_DATA_MOTION_VECTORS);
                 if (sd) {
                     AVMotionVector *mvs = (AVMotionVector *)sd->data;

                     *num_mvs = sd->size / sizeof(*mvs);

                     if (*num_mvs > 0) {

                         // allocate memory for motion vectors as a 1D array
                         if (!(*motion_vectors = (MVS_DTYPE *) malloc(*num_mvs * 10 * sizeof(MVS_DTYPE)))) {
                             std::cerr << "Memory allocation for motion vectors failed." << std::endl;
                             return false;
                         }

                         // store the motion vectors in the allocated memory (C contiguous)
                         for (MVS_DTYPE i = 0; i < *num_mvs; ++i) {
                             *(*motion_vectors + i*10     ) = static_cast<MVS_DTYPE>(mvs[i].source);
                             *(*motion_vectors + i*10 +  1) = static_cast<MVS_DTYPE>(mvs[i].w);
                             *(*motion_vectors + i*10 +  2) = static_cast<MVS_DTYPE>(mvs[i].h);
                             *(*motion_vectors + i*10 +  3) = static_cast<MVS_DTYPE>(mvs[i].src_x);
                             *(*motion_vectors + i*10 +  4) = static_cast<MVS_DTYPE>(mvs[i].src_y);
                             *(*motion_vectors + i*10 +  5) = static_cast<MVS_DTYPE>(mvs[i].dst_x);
                             *(*motion_vectors + i*10 +  6) = static_cast<MVS_DTYPE>(mvs[i].dst_y);
                             *(*motion_vectors + i*10 +  7) = static_cast<MVS_DTYPE>(mvs[i].motion_x);
                             *(*motion_vectors + i*10 +  8) = static_cast<MVS_DTYPE>(mvs[i].motion_y);
                             *(*motion_vectors + i*10 +  9) = static_cast<MVS_DTYPE>(mvs[i].motion_scale);
                             //*(*motion_vectors + i*11 + 10) = static_cast<MVS_DTYPE>(mvs[i].flags);
                         }
                     }
                 }

                 // convert the decoded AVFrame to a packed BGR buffer
                 if(!this->frame_to_buffer(data, step, width, height, cn)) {
                     std::cerr << "Conversion of frame failed." << 
std::endl;
                     return false;
                 }

                 // get frame type (I, P, B, etc.) and create a null-terminated C string
                 frame_type[0] = av_get_picture_type_char(this->frame->pict_type);
                 frame_type[1] = '\0';
             }
         }

         return true;
     }

};



//##############################################################################
//
//         MAIN
//
//##############################################################################


void draw_motion_vectors(cv::Mat frame, std::vector<AVMotionVector> *motion_vectors) {
     for (std::vector<AVMotionVector>::size_type i = 0; i < motion_vectors->size(); i++) {
         cv::Point start_pt, end_pt;
         start_pt.y = (*motion_vectors)[i].src_y;
         start_pt.x = (*motion_vectors)[i].src_x;
         end_pt.y = (*motion_vectors)[i].dst_y;
         end_pt.x =  (*motion_vectors)[i].dst_x;
         cv::arrowedLine(frame, start_pt, end_pt, cv::Scalar(0, 0, 255), 1, cv::LINE_AA, 0, 0.1);
     }
}


int main(int argc, char **argv)
{
     // filename of the video file
     const char *url = "vid.mp4";

     VideoCap cap;

     // open the video file
     bool ret = cap.open(url);
     if (!ret) {
         std::cerr << "Could not open the video url" << std::endl;
         return -1;
     }

     // continuously read and display video frames and motion vectors
     while(1) {

         std::cout << "##########################" << std::endl;

         cv::Mat frame;
         MVS_DTYPE *motion_vectors = NULL;
         MVS_DTYPE num_mvs = 0;
         char frame_type[2] = "?";

         auto start = std::chrono::high_resolution_clock::now();

         // read next video frame and corresponding motion vectors
         bool ret = cap.read(frame, frame_type, &motion_vectors, &num_mvs);

         auto finish = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double> elapsed = finish - start;
         std::cout << "Elapsed time: " << elapsed.count() << " s\n";

         std::chrono::milliseconds timespan(25);
         std::this_thread::sleep_for(timespan);

         // if there is an error reading the frame
         if(!ret) {
             std::cerr << "Could not read the next frame" << std::endl;
             return -1;
         }
         else {

             // if the frame is not empty
             cv::Size s = frame.size();
             if (s.height > 0 && s.width > 0) {

                 // print type of frame (I, P, B, etc)
                 std::cout << "Frame type: " << frame_type << std::endl;

                 // print motion vectors
                 /*for (std::vector<AVMotionVector>::size_type i = 0; i < motion_vectors.size(); i++) {
                     std::cout << std::setw(7) << "src: " << motion_vectors[i].source
                             << std::setw(6) << "w: " << static_cast<int16_t>(motion_vectors[i].w)
                             << std::setw(6) << "h: " << static_cast<int16_t>(motion_vectors[i].h)
                             << std::setw(10) << "src_x: " << motion_vectors[i].src_x
                             << std::setw(10) << "src_y: " << motion_vectors[i].src_y
                             << std::setw(10) << "dst_x: " << motion_vectors[i].dst_x
                             << std::setw(10) << "dst_y: " << motion_vectors[i].dst_y
                             << std::setw(10) << "mot_x: " << motion_vectors[i].motion_x
                             << std::setw(12) << "mot_y: " << motion_vectors[i].motion_y
                             << std::setw(12) << "mot_scl: " << motion_vectors[i].motion_scale
                             << std::setw(9) << "flags: " << motion_vectors[i].flags << std::endl;
                 }*/

                 //draw_motion_vectors(frame, &motion_vectors);

                 // show frame
                 cv::imshow("Frame", frame);

                 if (motion_vectors) {
                     free(motion_vectors);
                     motion_vectors = NULL;
                 }

                 // if user presses "ESC" stop program
                 char c = (char)cv::waitKey(1);
                 if (c == 27) {
                     break;
                 }
             }
         }
     }


     // when everything done, release the video capture object
     cap.release();

     // close the GUI window
     cv::destroyAllWindows();

     return 0;
}


