[FFmpeg-devel] [PATCH 4/6 v3] avformat/mov: add support for Immersive Audio Model and Formats in ISOBMFF
James Almer
jamrial at gmail.com
Wed Jan 31 19:26:52 EET 2024
Signed-off-by: James Almer <jamrial at gmail.com>
---
configure | 2 +-
libavformat/Makefile | 3 +-
libavformat/isom.h | 6 +
libavformat/mov.c | 283 ++++++++++++++++++++++++++++++++++++++++---
4 files changed, 276 insertions(+), 18 deletions(-)
diff --git a/configure b/configure
index 68f675a4bc..42ba5ec502 100755
--- a/configure
+++ b/configure
@@ -3545,7 +3545,7 @@ matroska_demuxer_suggest="bzlib zlib"
matroska_muxer_select="mpeg4audio riffenc aac_adtstoasc_bsf pgs_frame_merge_bsf vp9_superframe_bsf"
mlp_demuxer_select="mlp_parser"
mmf_muxer_select="riffenc"
-mov_demuxer_select="iso_media riffdec"
+mov_demuxer_select="iso_media riffdec iamf_frame_split_bsf"
mov_demuxer_suggest="zlib"
mov_muxer_select="iso_media riffenc rtpenc_chain vp9_superframe_bsf aac_adtstoasc_bsf ac3_parser"
mp3_demuxer_select="mpegaudio_parser"
diff --git a/libavformat/Makefile b/libavformat/Makefile
index 05b9b8a115..4131279e69 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -364,7 +364,8 @@ OBJS-$(CONFIG_MMF_MUXER) += mmf.o rawenc.o
OBJS-$(CONFIG_MODS_DEMUXER) += mods.o
OBJS-$(CONFIG_MOFLEX_DEMUXER) += moflex.o
OBJS-$(CONFIG_MOV_DEMUXER) += mov.o mov_chan.o mov_esds.o \
- qtpalette.o replaygain.o dovi_isom.o
+ qtpalette.o replaygain.o dovi_isom.o \
+ iamf.o
OBJS-$(CONFIG_MOV_MUXER) += movenc.o av1.o avc.o hevc.o vvc.o vpcc.o \
movenchint.o mov_chan.o rtp.o \
movenccenc.o movenc_ttml.o rawutils.o \
diff --git a/libavformat/isom.h b/libavformat/isom.h
index a4925b3b08..9f202b3d73 100644
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@@ -33,6 +33,7 @@
#include "libavutil/stereo3d.h"
#include "avio.h"
+#include "iamf.h"
#include "internal.h"
#include "dv.h"
@@ -167,6 +168,7 @@ typedef struct MOVStreamContext {
AVIOContext *pb;
int refcount;
int pb_is_copied;
+ int id; ///< AVStream id
int ffindex; ///< AVStream index
int next_chunk;
unsigned int chunk_count;
@@ -261,6 +263,10 @@ typedef struct MOVStreamContext {
AVEncryptionInfo *default_encrypted_sample;
MOVEncryptionIndex *encryption_index;
} cenc;
+
+ IAMFContext *iamf;
+ uint8_t *iamf_descriptors;
+ int iamf_descriptors_size;
} MOVStreamContext;
typedef struct HEIFItem {
diff --git a/libavformat/mov.c b/libavformat/mov.c
index 555c72dc79..981a48e074 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -58,6 +58,7 @@
#include "internal.h"
#include "avio_internal.h"
#include "demux.h"
+#include "iamf_parse.h"
#include "dovi_isom.h"
#include "riff.h"
#include "isom.h"
@@ -211,6 +212,7 @@ static int mov_read_covr(MOVContext *c, AVIOContext *pb, int type, int len)
}
st = c->fc->streams[c->fc->nb_streams - 1];
st->priv_data = sc;
+ sc->id = st->id;
sc->refcount = 1;
if (st->attached_pic.size >= 8 && id != AV_CODEC_ID_BMP) {
@@ -835,6 +837,178 @@ static int mov_read_dac3(MOVContext *c, AVIOContext *pb, MOVAtom atom)
return 0;
}
+/**
+ * Parse an 'iacb' box holding IAMF descriptor OBUs.
+ *
+ * The raw descriptors are kept in the stream context (they are later
+ * prepended to the first packet by the packet reader) and parsed with
+ * ff_iamfdec_read_descriptors(). One stream group is then created per
+ * audio element and per mix presentation. Every audio substream gets its
+ * own AVStream; the first substream of the first audio element reuses the
+ * track's existing stream, all others are newly created and share the
+ * track's MOVStreamContext. The track stream's metadata is moved to the
+ * stream groups. Finally the iamf_frame_split bitstream filter is attached
+ * to the track's stream.
+ *
+ * @param c    demuxer context
+ * @param pb   I/O context positioned at the start of the box payload
+ * @param atom the box being parsed; atom.size is the payload size
+ * @return 0 on success (or when the box is ignored), a negative AVERROR
+ *         code on failure
+ */
+static int mov_read_iacb(MOVContext *c, AVIOContext *pb, MOVAtom atom)
+{
+    AVStream *st;
+    MOVStreamContext *sc;
+    FFIOContext b;
+    AVIOContext *descriptor_pb;
+    AVDictionary *metadata;
+    IAMFContext *iamf;
+    char args[32];
+    int64_t start_time, duration;
+    unsigned size;
+    int nb_frames, disposition;
+    int version, ret;
+
+    if (atom.size < 5)
+        return AVERROR_INVALIDDATA;
+
+    if (c->fc->nb_streams < 1)
+        return 0;
+
+    version = avio_r8(pb);
+    if (version != 1) {
+        av_log(c->fc, AV_LOG_ERROR, "%s configurationVersion %d\n",
+               version < 1 ? "invalid" : "unsupported", version);
+        return AVERROR_INVALIDDATA;
+    }
+
+    size = ffio_read_leb(pb);
+    /* The version byte and at least one leb128 byte precede the descriptors,
+     * so they can never occupy more than atom.size - 2 bytes. Rejecting
+     * larger values avoids huge allocations driven by a bogus size field. */
+    if (!size || size > atom.size - 2)
+        return AVERROR_INVALIDDATA;
+
+    st = c->fc->streams[c->fc->nb_streams - 1];
+    sc = st->priv_data;
+
+    /* A second iacb box for the same track would overwrite (and leak) the
+     * context and descriptor buffer allocated for the first one. */
+    if (sc->iamf) {
+        av_log(c->fc, AV_LOG_WARNING, "ignoring duplicate iacb box\n");
+        return 0;
+    }
+
+    /* On any failure below, sc->iamf and sc->iamf_descriptors stay attached
+     * to the stream context and are released together with it. */
+    iamf = sc->iamf = av_mallocz(sizeof(*iamf));
+    if (!iamf)
+        return AVERROR(ENOMEM);
+
+    sc->iamf_descriptors = av_malloc(size);
+    if (!sc->iamf_descriptors)
+        return AVERROR(ENOMEM);
+
+    sc->iamf_descriptors_size = size;
+    ret = avio_read(pb, sc->iamf_descriptors, size);
+    if (ret != size)
+        return ret < 0 ? ret : AVERROR_INVALIDDATA;
+
+    /* Parse the descriptors from the in-memory copy. */
+    ffio_init_read_context(&b, sc->iamf_descriptors, size);
+    descriptor_pb = &b.pub;
+
+    ret = ff_iamfdec_read_descriptors(iamf, descriptor_pb, size, c->fc);
+    if (ret < 0)
+        return ret;
+
+    /* Take ownership of the track stream's metadata; it is copied to every
+     * stream group and freed at the end of this function. */
+    metadata = st->metadata;
+    st->metadata = NULL;
+    start_time = st->start_time;
+    nb_frames = st->nb_frames;
+    duration = st->duration;
+    disposition = st->disposition;
+
+    for (int i = 0; i < iamf->nb_audio_elements; i++) {
+        IAMFAudioElement *audio_element = iamf->audio_elements[i];
+        AVStreamGroup *stg =
+            avformat_stream_group_create(c->fc, AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT, NULL);
+
+        if (!stg) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        stg->id = audio_element->audio_element_id;
+        /* Hand the parsed element over to the stream group. */
+        stg->params.iamf_audio_element = audio_element->element;
+        audio_element->element = NULL;
+
+        for (int j = 0; j < audio_element->nb_substreams; j++) {
+            IAMFSubStream *substream = &audio_element->substreams[j];
+            AVStream *stream;
+
+            /* The very first substream reuses the track's own stream. */
+            if (!i && !j)
+                stream = st;
+            else
+                stream = avformat_new_stream(c->fc, NULL);
+            if (!stream) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            stream->start_time = start_time;
+            stream->nb_frames = nb_frames;
+            stream->duration = duration;
+            stream->disposition = disposition;
+            if (stream != st) {
+                /* Extra streams share the track's stream context. */
+                stream->priv_data = sc;
+                sc->refcount++;
+            }
+
+            ret = avcodec_parameters_copy(stream->codecpar, substream->codecpar);
+            if (ret < 0)
+                goto fail;
+
+            stream->id = substream->audio_substream_id;
+
+            /* Set the time base on the stream being configured, not only on
+             * the track's original stream. */
+            avpriv_set_pts_info(stream, 64, 1, sc->time_scale);
+
+            ret = avformat_stream_group_add_stream(stg, stream);
+            if (ret < 0)
+                goto fail;
+        }
+
+        ret = av_dict_copy(&stg->metadata, metadata, 0);
+        if (ret < 0)
+            goto fail;
+    }
+
+    for (int i = 0; i < iamf->nb_mix_presentations; i++) {
+        IAMFMixPresentation *mix_presentation = iamf->mix_presentations[i];
+        const AVIAMFMixPresentation *mix = mix_presentation->mix;
+        AVStreamGroup *stg =
+            avformat_stream_group_create(c->fc, AV_STREAM_GROUP_PARAMS_IAMF_MIX_PRESENTATION, NULL);
+
+        if (!stg) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        stg->id = mix_presentation->mix_presentation_id;
+        /* Hand the parsed mix presentation over to the stream group. */
+        stg->params.iamf_mix_presentation = mix_presentation->mix;
+        mix_presentation->mix = NULL;
+
+        for (int j = 0; j < mix->nb_submixes; j++) {
+            const AVIAMFSubmix *submix = mix->submixes[j];
+
+            for (int k = 0; k < submix->nb_elements; k++) {
+                const AVIAMFSubmixElement *submix_element = submix->elements[k];
+                const AVStreamGroup *audio_element = NULL;
+
+                /* Find the audio element group this submix element refers to. */
+                for (int l = 0; l < c->fc->nb_stream_groups; l++)
+                    if (c->fc->stream_groups[l]->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT &&
+                        c->fc->stream_groups[l]->id   == submix_element->audio_element_id) {
+                        audio_element = c->fc->stream_groups[l];
+                        break;
+                    }
+                av_assert0(audio_element);
+
+                /* A stream may already be part of this group through another
+                 * submix element, hence EEXIST is not an error. */
+                for (int l = 0; l < audio_element->nb_streams; l++) {
+                    ret = avformat_stream_group_add_stream(stg, audio_element->streams[l]);
+                    if (ret < 0 && ret != AVERROR(EEXIST))
+                        goto fail;
+                }
+            }
+        }
+
+        ret = av_dict_copy(&stg->metadata, metadata, 0);
+        if (ret < 0)
+            goto fail;
+    }
+
+    snprintf(args, sizeof(args), "first_index=%d", st->index);
+
+    ret = ff_stream_add_bitstream_filter(st, "iamf_frame_split", args);
+    if (ret == AVERROR_BSF_NOT_FOUND) {
+        av_log(c->fc, AV_LOG_ERROR, "iamf_frame_split bitstream filter "
+               "not found. This is a bug, please report it.\n");
+        ret = AVERROR_BUG;
+    }
+fail:
+    av_dict_free(&metadata);
+
+    return ret;
+}
+
+
static int mov_read_dec3(MOVContext *c, AVIOContext *pb, MOVAtom atom)
{
AVStream *st;
@@ -1380,7 +1554,7 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st,
// If the stream is referenced by any sidx, limit the search
// to fragments that referenced this stream in the sidx
if (sc->has_sidx) {
- frag_stream_info = get_frag_stream_info(frag_index, index, dst_st->id);
+ frag_stream_info = get_frag_stream_info(frag_index, index, sc->id);
if (frag_stream_info->sidx_pts != AV_NOPTS_VALUE)
return frag_stream_info->sidx_pts;
if (frag_stream_info->first_tfra_pts != AV_NOPTS_VALUE)
@@ -1391,9 +1565,11 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st,
for (i = 0; i < frag_index->item[index].nb_stream_info; i++) {
AVStream *frag_stream = NULL;
frag_stream_info = &frag_index->item[index].stream_info[i];
- for (j = 0; j < s->nb_streams; j++)
- if (s->streams[j]->id == frag_stream_info->id)
+ for (j = 0; j < s->nb_streams; j++) {
+ MOVStreamContext *sc2 = s->streams[j]->priv_data;
+ if (sc2->id == frag_stream_info->id)
frag_stream = s->streams[j];
+ }
if (!frag_stream) {
av_log(s, AV_LOG_WARNING, "No stream matching sidx ID found.\n");
continue;
@@ -1459,12 +1635,13 @@ static int update_frag_index(MOVContext *c, int64_t offset)
for (i = 0; i < c->fc->nb_streams; i++) {
// Avoid building frag index if streams lack track id.
- if (c->fc->streams[i]->id < 0) {
+ MOVStreamContext *sc = c->fc->streams[i]->priv_data;
+ if (sc->id < 0) {
av_free(frag_stream_info);
return AVERROR_INVALIDDATA;
}
- frag_stream_info[i].id = c->fc->streams[i]->id;
+ frag_stream_info[i].id = sc->id;
frag_stream_info[i].sidx_pts = AV_NOPTS_VALUE;
frag_stream_info[i].tfdt_dts = AV_NOPTS_VALUE;
frag_stream_info[i].next_trun_dts = AV_NOPTS_VALUE;
@@ -3259,7 +3436,7 @@ static int mov_read_stts(MOVContext *c, AVIOContext *pb, MOVAtom atom)
"All samples in data stream index:id [%d:%d] have zero "
"duration, stream set to be discarded by default. Override "
"using AVStream->discard or -discard for ffmpeg command.\n",
- st->index, st->id);
+ st->index, sc->id);
st->discard = AVDISCARD_ALL;
}
sc->track_end = duration;
@@ -4639,6 +4816,50 @@ static void fix_timescale(MOVContext *c, MOVStreamContext *sc)
}
}
+/**
+ * Propagate the timing information and sample index of an IAMF track's
+ * stream to every other stream of the track's audio element groups.
+ *
+ * For each audio element in sc->iamf the matching audio element stream
+ * group is looked up by id; every stream in it gets its codecpar bit_rate
+ * reset to 0, and every stream other than st additionally receives a copy
+ * of st's time base, start time, duration, frame count, disposition,
+ * discard setting, skip_samples and index entries.
+ *
+ * @param c  demuxer context
+ * @param st the track's stream; its priv_data must carry a non-NULL iamf
+ * @return 0 on success, AVERROR(ENOMEM) if an index copy cannot be allocated
+ */
+static int mov_update_iamf_streams(MOVContext *c, const AVStream *st)
+{
+    const MOVStreamContext *sc = st->priv_data;
+    const FFStream *sti = cffstream(st);
+
+    for (int i = 0; i < sc->iamf->nb_audio_elements; i++) {
+        const AVStreamGroup *stg = NULL;
+
+        /* Only audio element groups qualify: a mix presentation group may
+         * carry the same numeric id. The last match is kept. */
+        for (int j = 0; j < c->fc->nb_stream_groups; j++) {
+            const AVStreamGroup *cur = c->fc->stream_groups[j];
+
+            if (cur->type == AV_STREAM_GROUP_PARAMS_IAMF_AUDIO_ELEMENT &&
+                cur->id   == sc->iamf->audio_elements[i]->audio_element_id)
+                stg = cur;
+        }
+        av_assert0(stg);
+
+        for (int j = 0; j < stg->nb_streams; j++) {
+            AVStream *out = stg->streams[j];
+            FFStream *out_sti = ffstream(out);
+
+            out->codecpar->bit_rate = 0;
+
+            /* The track's own stream already holds all of the data below. */
+            if (out == st)
+                continue;
+
+            out->time_base   = st->time_base;
+            out->start_time  = st->start_time;
+            out->duration    = st->duration;
+            out->nb_frames   = st->nb_frames;
+            out->disposition = st->disposition;
+            out->discard     = st->discard;
+
+            av_assert0(!out_sti->index_entries);
+            out_sti->nb_index_entries = sti->nb_index_entries;
+            out_sti->skip_samples     = sti->skip_samples;
+
+            /* Nothing to duplicate when the source has no index; this also
+             * avoids calling memcpy() with a NULL source pointer. */
+            if (!sti->index_entries_allocated_size)
+                continue;
+
+            out_sti->index_entries = av_malloc(sti->index_entries_allocated_size);
+            if (!out_sti->index_entries)
+                return AVERROR(ENOMEM);
+
+            out_sti->index_entries_allocated_size = sti->index_entries_allocated_size;
+            memcpy(out_sti->index_entries, sti->index_entries,
+                   sti->index_entries_allocated_size);
+        }
+    }
+
+    return 0;
+}
+
+
static int mov_read_trak(MOVContext *c, AVIOContext *pb, MOVAtom atom)
{
AVStream *st;
@@ -4702,6 +4923,12 @@ static int mov_read_trak(MOVContext *c, AVIOContext *pb, MOVAtom atom)
mov_build_index(c, st);
+ if (sc->iamf) {
+ ret = mov_update_iamf_streams(c, st);
+ if (ret < 0)
+ return ret;
+ }
+
if (sc->dref_id-1 < sc->drefs_count && sc->drefs[sc->dref_id-1].path) {
MOVDref *dref = &sc->drefs[sc->dref_id - 1];
if (c->enable_drefs) {
@@ -4934,6 +5161,7 @@ static int heif_add_stream(MOVContext *c, HEIFItem *item)
st->priv_data = sc;
st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
st->codecpar->codec_id = mov_codec_id(st, item->type);
+ sc->id = st->id;
sc->ffindex = st->index;
c->trak_index = st->index;
st->avg_frame_rate.num = st->avg_frame_rate.den = 1;
@@ -5032,6 +5260,7 @@ static int mov_read_tkhd(MOVContext *c, AVIOContext *pb, MOVAtom atom)
avio_rb32(pb); /* modification time */
}
st->id = (int)avio_rb32(pb); /* track id (NOT 0 !)*/
+ sc->id = st->id;
avio_rb32(pb); /* reserved */
/* highlevel (considering edits) duration in movie timebase */
@@ -5206,7 +5435,8 @@ static int mov_read_tfdt(MOVContext *c, AVIOContext *pb, MOVAtom atom)
int64_t base_media_decode_time;
for (i = 0; i < c->fc->nb_streams; i++) {
- if (c->fc->streams[i]->id == frag->track_id) {
+ sc = c->fc->streams[i]->priv_data;
+ if (sc->id == frag->track_id) {
st = c->fc->streams[i];
break;
}
@@ -5259,7 +5489,8 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom)
}
for (i = 0; i < c->fc->nb_streams; i++) {
- if (c->fc->streams[i]->id == frag->track_id) {
+ sc = c->fc->streams[i]->priv_data;
+ if (sc->id == frag->track_id) {
st = c->fc->streams[i];
sti = ffstream(st);
break;
@@ -5562,7 +5793,8 @@ static int mov_read_sidx(MOVContext *c, AVIOContext *pb, MOVAtom atom)
track_id = avio_rb32(pb); // Reference ID
for (i = 0; i < c->fc->nb_streams; i++) {
- if (c->fc->streams[i]->id == track_id) {
+ sc = c->fc->streams[i]->priv_data;
+ if (sc->id == track_id) {
st = c->fc->streams[i];
break;
}
@@ -6454,7 +6686,8 @@ static int get_current_encryption_info(MOVContext *c, MOVEncryptionIndex **encry
frag_stream_info = get_current_frag_stream_info(&c->frag_index);
if (frag_stream_info) {
for (i = 0; i < c->fc->nb_streams; i++) {
- if (c->fc->streams[i]->id == frag_stream_info->id) {
+ *sc = c->fc->streams[i]->priv_data;
+ if ((*sc)->id == frag_stream_info->id) {
st = c->fc->streams[i];
break;
}
@@ -7398,7 +7631,7 @@ static int cenc_filter(MOVContext *mov, AVStream* st, MOVStreamContext *sc, AVPa
AVEncryptionInfo *encrypted_sample;
int encrypted_index, ret;
- frag_stream_info = get_frag_stream_info_from_pkt(&mov->frag_index, pkt, st->id);
+ frag_stream_info = get_frag_stream_info_from_pkt(&mov->frag_index, pkt, sc->id);
encrypted_index = current_index;
encryption_index = NULL;
if (frag_stream_info) {
@@ -8175,6 +8408,7 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
{ MKTAG('i','s','p','e'), mov_read_ispe },
{ MKTAG('i','p','r','p'), mov_read_iprp },
{ MKTAG('i','i','n','f'), mov_read_iinf },
+{ MKTAG('i','a','c','b'), mov_read_iacb },
{ 0, NULL }
};
@@ -8406,11 +8640,13 @@ static void mov_read_chapters(AVFormatContext *s)
AVStream *st = NULL;
FFStream *sti = NULL;
chapter_track = mov->chapter_tracks[j];
- for (i = 0; i < s->nb_streams; i++)
- if (s->streams[i]->id == chapter_track) {
+ for (i = 0; i < s->nb_streams; i++) {
+ sc = mov->fc->streams[i]->priv_data;
+ if (sc->id == chapter_track) {
st = s->streams[i];
break;
}
+ }
if (!st) {
av_log(s, AV_LOG_ERROR, "Referenced QT chapter track not found\n");
continue;
@@ -8642,6 +8878,11 @@ static void mov_free_stream_context(AVFormatContext *s, AVStream *st)
av_freep(&sc->spherical);
av_freep(&sc->mastering);
av_freep(&sc->coll);
+
+ ff_iamf_uninit_context(sc->iamf);
+ av_freep(&sc->iamf);
+ av_freep(&sc->iamf_descriptors);
+ sc->iamf_descriptors_size = 0;
}
static int mov_read_close(AVFormatContext *s)
@@ -8896,9 +9137,11 @@ static int mov_read_header(AVFormatContext *s)
AVDictionaryEntry *tcr;
int tmcd_st_id = -1;
- for (j = 0; j < s->nb_streams; j++)
- if (s->streams[j]->id == sc->timecode_track)
+ for (j = 0; j < s->nb_streams; j++) {
+ MOVStreamContext *sc2 = s->streams[j]->priv_data;
+ if (sc2->id == sc->timecode_track)
tmcd_st_id = j;
+ }
if (tmcd_st_id < 0 || tmcd_st_id == i)
continue;
@@ -9211,7 +9454,15 @@ static int mov_read_packet(AVFormatContext *s, AVPacket *pkt)
if (st->codecpar->codec_id == AV_CODEC_ID_EIA_608 && sample->size > 8)
ret = get_eia608_packet(sc->pb, pkt, sample->size);
- else
+ else if (sc->iamf_descriptors_size) {
+ ret = av_new_packet(pkt, sc->iamf_descriptors_size);
+ if (ret < 0)
+ return ret;
+ pkt->pos = avio_tell(sc->pb);
+ memcpy(pkt->data, sc->iamf_descriptors, sc->iamf_descriptors_size);
+ sc->iamf_descriptors_size = 0;
+ ret = av_append_packet(sc->pb, pkt, sample->size);
+ } else
ret = av_get_packet(sc->pb, pkt, sample->size);
if (ret < 0) {
if (should_retry(sc->pb, ret)) {
--
2.43.0
More information about the ffmpeg-devel
mailing list