[libav-devel] [PATCH] avcodec/qsv: fix async support

Discussion:

Dmitry Rogozhkin

2018-07-24 17:36:19 UTC

Current implementations of qsv components incorrectly work with async level, they
actually try to work in async+1 level stepping into MFX_WRN_DEVICE_BUSY and polling
loop. This change address this misbehaviour.

Signed-off-by: Dmitry Rogozhkin <***@intel.com>
Cc: Maxym Dmytrychenko <***@gmail.com>
Cc: Zhong Li <***@intel.com>
---
libavcodec/qsvdec.c | 15 ++++++++++++---
libavcodec/qsvenc.c | 17 +++++++++++++----
2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index 32f1fe7..b9707f7 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -110,6 +110,16 @@ static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession ses
return 0;
}

+static inline unsigned int qsv_fifo_item_size(void)
+{
+ return sizeof(mfxSyncPoint*) + sizeof(QSVFrame*);
+}
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
{
const AVPixFmtDescriptor *desc;
@@ -125,8 +135,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
return AVERROR_BUG;

if (!q->async_fifo) {
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(mfxSyncPoint*) + sizeof(QSVFrame*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth * qsv_fifo_item_size());
if (!q->async_fifo)
return AVERROR(ENOMEM);
}
@@ -384,7 +393,7 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
av_freep(&sync);
}

- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!avpkt->size && av_fifo_size(q->async_fifo))) {
AVFrame *src_frame;

diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 3ce5ffe..40ddb34 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -777,7 +777,7 @@ static int qsv_init_opaque_alloc(AVCodecContext *avctx, QSVEncContext *q)
mfxFrameSurface1 *surfaces;
int nb_surfaces, i;

- nb_surfaces = qsv->nb_opaque_surfaces + q->req.NumFrameSuggested + q->async_depth;
+ nb_surfaces = qsv->nb_opaque_surfaces + q->req.NumFrameSuggested;

q->opaque_alloc_buf = av_buffer_allocz(sizeof(*surfaces) * nb_surfaces);
if (!q->opaque_alloc_buf)
@@ -848,6 +848,16 @@ static int qsvenc_init_session(AVCodecContext *avctx, QSVEncContext *q)
return 0;
}

+static inline unsigned int qsv_fifo_item_size(void)
+{
+ return sizeof(AVPacket) + sizeof(mfxSyncPoint*) + sizeof(mfxBitstream*);
+}
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
{
int iopattern = 0;
@@ -856,8 +866,7 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)

q->param.AsyncDepth = q->async_depth;

- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(AVPacket) + sizeof(mfxSyncPoint*) + sizeof(mfxBitstream*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth * qsv_fifo_item_size());
if (!q->async_fifo)
return AVERROR(ENOMEM);

@@ -1214,7 +1223,7 @@ int ff_qsv_encode(AVCodecContext *avctx, QSVEncContext *q,
if (ret < 0)
return ret;

- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!frame && av_fifo_size(q->async_fifo))) {
AVPacket new_pkt;
mfxBitstream *bs;

--
1.8.3.1

Maxym Dmytrychenko

2018-07-25 13:29:44 UTC

Permalink

On Wed, Jul 25, 2018 at 3:39 AM Dmitry Rogozhkin <

Post by Dmitry Rogozhkin
Current implementations of qsv components incorrectly work with async level, they
actually try to work in async+1 level stepping into MFX_WRN_DEVICE_BUSY and polling
loop. This change address this misbehaviour.
---
libavcodec/qsvdec.c | 15 ++++++++++++---
libavcodec/qsvenc.c | 17 +++++++++++++----
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index 32f1fe7..b9707f7 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -110,6 +110,16 @@ static int qsv_init_session(AVCodecContext *avctx,
QSVContext *q, mfxSession ses
return 0;
}
+static inline unsigned int qsv_fifo_item_size(void)
+{
+ return sizeof(mfxSyncPoint*) + sizeof(QSVFrame*);
+}
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
{
const AVPixFmtDescriptor *desc;
@@ -125,8 +135,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
return AVERROR_BUG;
if (!q->async_fifo) {
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(mfxSyncPoint*) + sizeof(QSVFrame*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth *
qsv_fifo_item_size());
if (!q->async_fifo)
return AVERROR(ENOMEM);
}
@@ -384,7 +393,7 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
av_freep(&sync);
}
- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!avpkt->size && av_fifo_size(q->async_fifo))) {
AVFrame *src_frame;
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 3ce5ffe..40ddb34 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -777,7 +777,7 @@ static int qsv_init_opaque_alloc(AVCodecContext
*avctx, QSVEncContext *q)
mfxFrameSurface1 *surfaces;
int nb_surfaces, i;
- nb_surfaces = qsv->nb_opaque_surfaces + q->req.NumFrameSuggested + q->async_depth;
+ nb_surfaces = qsv->nb_opaque_surfaces + q->req.NumFrameSuggested;
q->opaque_alloc_buf = av_buffer_allocz(sizeof(*surfaces) * nb_surfaces);
if (!q->opaque_alloc_buf)
@@ -848,6 +848,16 @@ static int qsvenc_init_session(AVCodecContext *avctx,
QSVEncContext *q)
return 0;
}
+static inline unsigned int qsv_fifo_item_size(void)
+{
+ return sizeof(AVPacket) + sizeof(mfxSyncPoint*) +
sizeof(mfxBitstream*);
+}
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
{
int iopattern = 0;
@@ -856,8 +866,7 @@ int ff_qsv_enc_init(AVCodecContext *avctx,
QSVEncContext *q)
q->param.AsyncDepth = q->async_depth;
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(AVPacket) +
sizeof(mfxSyncPoint*) + sizeof(mfxBitstream*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth * qsv_fifo_item_size());
if (!q->async_fifo)
return AVERROR(ENOMEM);
@@ -1214,7 +1223,7 @@ int ff_qsv_encode(AVCodecContext *avctx,
QSVEncContext *q,
if (ret < 0)
return ret;
- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!frame && av_fifo_size(q->async_fifo))) {
AVPacket new_pkt;
mfxBitstream *bs;
--
1.8.3.1

thanks for the patch.
any performance impact you see, depth == 1 and higher ?

our tests shows some drop when async_depth == 1.

regards
Max

Rogozhkin, Dmitry V

2018-07-25 17:39:51 UTC

Permalink

On Wed, 2018-07-25 at 15:29 +0200, Maxym Dmytrychenko wrote:
thanks for the patch.
any performance impact you see, depth == 1 and higher ?

our tests shows some drop when async_depth == 1.

You indeed can notice some FPS drop on async=~1. This is due to the fact that ffmpeg tried to work on async+1 and it succeeded to some extent. What ffmpeg did was a polling loop on data submission (instead of a sleep on SyncOperation) which means that it gained some time and pushed data more instantly on the appearance of the empty slot. Since the patch removes this instant pushing you see some performance drops which I would actually consider as alignments to the use cases: if user wants synchronous pipeline (async=1) - that's for the reason, like in video conferencing. Well, returning to the performance story. So, there were some gains in async+1 approach. However, there were few important drawbacks on which we need to pay attention:

1. Such polling loop comes with the excessive CPU%. You can notice it if you will consider multiple parallel transcoding, like here:
# cat ./run.sh
for i in `seq 1 8`; do
ffmpeg -y -hwaccel qsv -c:v h264_qsv -i JM_BQTerrace_1920x1080_60_22.h264 \
-c:v h264_qsv -preset medium -profile:v high -level:v 51 -g 50 -bf 3 -async_depth 1 -b:v 16M -minrate 16M -maxrate 16M qsv_$i.264 &
done
I for example have on 8-cores system (/usr/bin/time -f "%e;%U;%S;%P" ./run.sh):
17.62;10.64;3.22;78% # before the patch
17.87;9.44;2.57;67% # after the patch
And the impact will grow with the decrease of the resolution.

2. Another drawback which I am not sure how to measure on ffmpeg is latency impact. Which is that the polling loop on async+1 was not an ultimate polling, instead you had usleep. Thus, you actually impacted latency of the output frames flow since you introduced unpredicatable sleep up to your sleep interval. After the change fluctuations in latency should go away.

So, I encourage to embrace this change and understand that there are some changes in performance associated with it which aligns the behavior with the expected usage models.

Luca Barbato

2018-07-25 20:23:55 UTC

Permalink

Post by Rogozhkin, Dmitry V
So, I encourage to embrace this change and understand that there are
some changes in performance associated with it which aligns the
behavior with the expected usage models.

I guess we could change the default so it is less surprising.

lu

Maxym Dmytrychenko

2018-07-25 20:36:53 UTC

Permalink

Post by Luca Barbato

I guess we could change the default so it is less surprising.
lu
_______________________________________________
libav-devel mailing list
https://lists.libav.org/mailman/listinfo/libav-devel

ok, we can skip performance impact in favor of logical correctness

Rogozhkin, Dmitry V

2018-07-25 22:41:29 UTC

Permalink

Post by Maxym Dmytrychenko

Post by Luca Barbato

I guess we could change the default so it is less surprising.

The default is async=4, and nothing should change much for FPS for this
case. Or you meant something else?

Post by Maxym Dmytrychenko

Post by Luca Barbato
lu
_______________________________________________
libav-devel mailing list
https://lists.libav.org/mailman/listinfo/libav-devel

ok, we can skip performance impact in favor of logical correctness
_______________________________________________
libav-devel mailing list
https://lists.libav.org/mailman/listinfo/libav-devel

Li, Zhong

2018-07-26 05:55:35 UTC

Permalink

-----Original Message-----
From: Rogozhkin, Dmitry V
Sent: Wednesday, July 25, 2018 1:36 AM
Subject: [PATCH] avcodec/qsv: fix async support
Current implementations of qsv components incorrectly work with async
level, they actually try to work in async+1 level stepping into
MFX_WRN_DEVICE_BUSY and polling loop. This change address this
misbehaviour.
---
libavcodec/qsvdec.c | 15 ++++++++++++--- libavcodec/qsvenc.c | 17
+++++++++++++----
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c index
32f1fe7..b9707f7 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -110,6 +110,16 @@ static int qsv_init_session(AVCodecContext *avctx,
QSVContext *q, mfxSession ses
return 0;
}
+static inline unsigned int qsv_fifo_item_size(void) {
+ return sizeof(mfxSyncPoint*) + sizeof(QSVFrame*); }
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo) {
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q) {
const AVPixFmtDescriptor *desc;
@@ -125,8 +135,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
return AVERROR_BUG;
if (!q->async_fifo) {
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(mfxSyncPoint*) + sizeof(QSVFrame*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth *
+ qsv_fifo_item_size());
if (!q->async_fifo)
return AVERROR(ENOMEM);
}
@@ -384,7 +393,7 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
av_freep(&sync);
}
- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!avpkt->size && av_fifo_size(q->async_fifo))) {
AVFrame *src_frame;
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c index
3ce5ffe..40ddb34 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -777,7 +777,7 @@ static int qsv_init_opaque_alloc(AVCodecContext
*avctx, QSVEncContext *q)
mfxFrameSurface1 *surfaces;
int nb_surfaces, i;
- nb_surfaces = qsv->nb_opaque_surfaces +
q->req.NumFrameSuggested + q->async_depth;
+ nb_surfaces = qsv->nb_opaque_surfaces +
q->req.NumFrameSuggested;
q->opaque_alloc_buf = av_buffer_allocz(sizeof(*surfaces) * nb_surfaces);
if (!q->opaque_alloc_buf)
@@ -848,6 +848,16 @@ static int qsvenc_init_session(AVCodecContext
*avctx, QSVEncContext *q)
return 0;
}
+static inline unsigned int qsv_fifo_item_size(void) {
+ return sizeof(AVPacket) + sizeof(mfxSyncPoint*) +
+sizeof(mfxBitstream*); }
+
+static inline unsigned int qsv_fifo_size(const AVFifoBuffer* fifo) {
+ return av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+

Add a blank before and after "/" is a unified coding style.
Maybe better to move it to qsv.c since it is common for qsvdec/enc.

int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q) {
int iopattern = 0;
@@ -856,8 +866,7 @@ int ff_qsv_enc_init(AVCodecContext *avctx,
QSVEncContext *q)
q->param.AsyncDepth = q->async_depth;
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(AVPacket) +
sizeof(mfxSyncPoint*) + sizeof(mfxBitstream*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth *
+ qsv_fifo_item_size());

if (!q->async_fifo)
return AVERROR(ENOMEM);
@@ -1214,7 +1223,7 @@ int ff_qsv_encode(AVCodecContext *avctx,
QSVEncContext *q,
if (ret < 0)
return ret;
- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!frame && av_fifo_size(q->async_fifo))) {
AVPacket new_pkt;
mfxBitstream *bs;
--
1.8.3.1

Maxym Dmytrychenko

2018-07-26 07:44:45 UTC

Permalink

Post by Li, Zhong

Add a blank before and after "/" is a unified coding style.
Maybe better to move it to qsv.c since it is common for qsvdec/enc.

good point - agree.

Post by Li, Zhong

int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q) {
int iopattern = 0;
@@ -856,8 +866,7 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
q->param.AsyncDepth = q->async_depth;
- q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
- (sizeof(AVPacket) +
sizeof(mfxSyncPoint*) + sizeof(mfxBitstream*)));
+ q->async_fifo = av_fifo_alloc(q->async_depth *
+ qsv_fifo_item_size());

http://ffmpeg.org/pipermail/ffmpeg-devel/2017-November/219643.html
Any historic reason to add "+1" in the initial implementation? If no, I
would like to see this patch applied.

just a minor reason.

Post by Li, Zhong
BTW, currently the option "async_depth" range is 0 ~ INT_MAX. Need to
change it to 1 ~ INT_MAX now, no?

yes, will adjust this as well

Post by Li, Zhong
if (!q->async_fifo)

return AVERROR(ENOMEM);
@@ -1214,7 +1223,7 @@ int ff_qsv_encode(AVCodecContext *avctx, QSVEncContext *q,
if (ret < 0)
return ret;
- if (!av_fifo_space(q->async_fifo) ||
+ if ((qsv_fifo_size(q->async_fifo) >= q->async_depth) ||
(!frame && av_fifo_size(q->async_fifo))) {
AVPacket new_pkt;
mfxBitstream *bs;
--
1.8.3.1

Rogozhkin, Dmitry V

2018-07-26 19:13:31 UTC

Permalink

Add a blank before and after "/" is a unified coding style.
Maybe better to move it to qsv.c since it is common for qsvdec/enc.

good point - agree.

I uploaded v2 of the patch. I added spaces, but I did not move the qsv_fifo_size to the common place. Indeed it is the same between dec/enc, but qsv_fifo_item_size is not the same (sizeof queue objects are different). Thus, I can move only qsv_fifo_size and rely on the symbol resolution that qsv_fifo_item_size will be found. I don't think this is a good idea. So, I left as is, but fixed white space.

I agree with remove the "+1". And noted someone was also confused too: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-November/219643.html
Any historic reason to add "+1" in the initial implementation? If no, I would like to see this patch applied.

just a minor reason.

BTW, currently the option "async_depth" range is 0 ~ INT_MAX. Need to change it to 1 ~ INT_MAX now, no?

yes, will adjust this as well

I adjusted the range. Actually this can be handled other way round. If you pass AsyncDepth=0 to the mediasdk it will correct this parameter and use an internal default (I think =5). Application can query this value with either Query or GetVideoParam after component Init. However, I don't think this is a good idea to have 2 level of defaults, one on ffmpeg level, another on MediaSDK level. Thus, I suggest to consider async_depth>=1 and have ffmpeg level default (=4 at the moment).