ffmpeg: Correctly handle sample rates

Previously, we just used the native sample rate for encoding. However, some encoders, like libmp3lame, don't support it. Therefore, we now use a supported sample rate (preferring the native one if possible).

FFmpeg requires audio data to be sent as a sequence of frames, each containing the same specific number of samples. Previously, we buffered input samples in FFmpegBackend. However, as the source and destination sample rates can now be different, we should buffer resampled data instead. swresample has an internal input buffer, so we now just forward all data to it and 'gradually' receive resampled data, at most one frame_size at a time. When there is not enough resampled data to form a frame, we record the current offset and request less data on the next call.

Additionally, this commit fixes a flaw. When an encoder supports variable frame sizes, its frame size is reported as 0, which breaks our buffering system. Now we treat variable frame size encoders as having a frame size of 160 (the size of an HLE audio frame).
This commit is contained in:
zhupengfei 2020-02-01 12:23:07 +08:00
parent 8b9c01ded9
commit 4161163d9c
No known key found for this signature in database
GPG key ID: DD129E108BD09378
2 changed files with 84 additions and 64 deletions

View file

@ -211,7 +211,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
if (!FFmpegStream::Init(format_context)) if (!FFmpegStream::Init(format_context))
return false; return false;
sample_count = 0; frame_count = 0;
// Initialize audio codec // Initialize audio codec
const AVCodec* codec = avcodec_find_encoder_by_name(Settings::values.audio_encoder.c_str()); const AVCodec* codec = avcodec_find_encoder_by_name(Settings::values.audio_encoder.c_str());
@ -243,7 +243,20 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
codec_context->sample_fmt = AV_SAMPLE_FMT_S16P; codec_context->sample_fmt = AV_SAMPLE_FMT_S16P;
} }
if (codec->supported_samplerates) {
codec_context->sample_rate = codec->supported_samplerates[0];
// Prefer native sample rate if supported
const int* ptr = codec->supported_samplerates;
while ((*ptr)) {
if ((*ptr) == AudioCore::native_sample_rate) {
codec_context->sample_rate = AudioCore::native_sample_rate; codec_context->sample_rate = AudioCore::native_sample_rate;
break;
}
ptr++;
}
} else {
codec_context->sample_rate = AudioCore::native_sample_rate;
}
codec_context->channel_layout = AV_CH_LAYOUT_STEREO; codec_context->channel_layout = AV_CH_LAYOUT_STEREO;
codec_context->channels = 2; codec_context->channels = 2;
@ -259,6 +272,12 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
LOG_WARNING(Render, "Audio encoder options not found: {}", buf); LOG_WARNING(Render, "Audio encoder options not found: {}", buf);
} }
if (codec_context->frame_size) {
frame_size = static_cast<u64>(codec_context->frame_size);
} else { // variable frame size support
frame_size = std::tuple_size<AudioCore::StereoFrame16>::value;
}
// Create audio stream // Create audio stream
stream = avformat_new_stream(format_context, codec); stream = avformat_new_stream(format_context, codec);
if (!stream || avcodec_parameters_from_context(stream->codecpar, codec_context.get()) < 0) { if (!stream || avcodec_parameters_from_context(stream->codecpar, codec_context.get()) < 0) {
@ -291,7 +310,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
// Allocate resampled data // Allocate resampled data
int error = int error =
av_samples_alloc_array_and_samples(&resampled_data, nullptr, codec_context->channels, av_samples_alloc_array_and_samples(&resampled_data, nullptr, codec_context->channels,
codec_context->frame_size, codec_context->sample_fmt, 0); frame_size, codec_context->sample_fmt, 0);
if (error < 0) { if (error < 0) {
LOG_ERROR(Render, "Could not allocate samples storage"); LOG_ERROR(Render, "Could not allocate samples storage");
return false; return false;
@ -312,31 +331,62 @@ void FFmpegAudioStream::Free() {
av_freep(&resampled_data); av_freep(&resampled_data);
} }
void FFmpegAudioStream::ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) { void FFmpegAudioStream::ProcessFrame(const VariableAudioFrame& channel0,
const VariableAudioFrame& channel1) {
ASSERT_MSG(channel0.size() == channel1.size(), ASSERT_MSG(channel0.size() == channel1.size(),
"Frames of the two channels must have the same number of samples"); "Frames of the two channels must have the same number of samples");
std::array<const u8*, 2> src_data = {reinterpret_cast<u8*>(channel0.data()),
reinterpret_cast<u8*>(channel1.data())};
if (swr_convert(swr_context.get(), resampled_data, channel0.size(), src_data.data(),
channel0.size()) < 0) {
const auto sample_size = av_get_bytes_per_sample(codec_context->sample_fmt);
std::array<const u8*, 2> src_data = {reinterpret_cast<const u8*>(channel0.data()),
reinterpret_cast<const u8*>(channel1.data())};
std::array<u8*, 2> dst_data = {resampled_data[0] + sample_size * offset,
resampled_data[1] + sample_size * offset};
auto resampled_count = swr_convert(swr_context.get(), dst_data.data(), frame_size - offset,
src_data.data(), channel0.size());
if (resampled_count < 0) {
LOG_ERROR(Render, "Audio frame dropped: Could not resample data"); LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
return; return;
} }
offset += resampled_count;
if (offset < frame_size) { // Still not enough to form a frame
return;
}
while (true) {
// Prepare frame // Prepare frame
audio_frame->nb_samples = channel0.size(); audio_frame->nb_samples = frame_size;
audio_frame->data[0] = resampled_data[0]; audio_frame->data[0] = resampled_data[0];
audio_frame->data[1] = resampled_data[1]; audio_frame->data[1] = resampled_data[1];
audio_frame->pts = sample_count; audio_frame->pts = frame_count * frame_size;
sample_count += channel0.size(); frame_count++;
SendFrame(audio_frame.get()); SendFrame(audio_frame.get());
// swr_convert buffers input internally. Try to get more resampled data
resampled_count = swr_convert(swr_context.get(), resampled_data, frame_size, nullptr, 0);
if (resampled_count < 0) {
LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
return;
}
if (static_cast<u64>(resampled_count) < frame_size) {
offset = resampled_count;
break;
}
}
} }
std::size_t FFmpegAudioStream::GetAudioFrameSize() const { void FFmpegAudioStream::Flush() {
ASSERT_MSG(codec_context, "Codec context is not initialized yet!"); // Send the last samples
return codec_context->frame_size; audio_frame->nb_samples = offset;
audio_frame->data[0] = resampled_data[0];
audio_frame->data[1] = resampled_data[1];
audio_frame->pts = frame_count * frame_size;
SendFrame(audio_frame.get());
FFmpegStream::Flush();
} }
FFmpegMuxer::~FFmpegMuxer() { FFmpegMuxer::~FFmpegMuxer() {
@ -402,7 +452,8 @@ void FFmpegMuxer::ProcessVideoFrame(VideoFrame& frame) {
video_stream.ProcessFrame(frame); video_stream.ProcessFrame(frame);
} }
void FFmpegMuxer::ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) { void FFmpegMuxer::ProcessAudioFrame(const VariableAudioFrame& channel0,
const VariableAudioFrame& channel1) {
audio_stream.ProcessFrame(channel0, channel1); audio_stream.ProcessFrame(channel0, channel1);
} }
@ -414,10 +465,6 @@ void FFmpegMuxer::FlushAudio() {
audio_stream.Flush(); audio_stream.Flush();
} }
std::size_t FFmpegMuxer::GetAudioFrameSize() const {
return audio_stream.GetAudioFrameSize();
}
void FFmpegMuxer::WriteTrailer() { void FFmpegMuxer::WriteTrailer() {
av_write_trailer(format_context.get()); av_write_trailer(format_context.get());
} }
@ -498,24 +545,20 @@ void FFmpegBackend::AddVideoFrame(VideoFrame frame) {
} }
void FFmpegBackend::AddAudioFrame(AudioCore::StereoFrame16 frame) { void FFmpegBackend::AddAudioFrame(AudioCore::StereoFrame16 frame) {
std::array<std::array<s16, 160>, 2> refactored_frame; std::array<VariableAudioFrame, 2> refactored_frame;
for (auto& channel : refactored_frame) {
channel.resize(frame.size());
}
for (std::size_t i = 0; i < frame.size(); i++) { for (std::size_t i = 0; i < frame.size(); i++) {
refactored_frame[0][i] = frame[i][0]; refactored_frame[0][i] = frame[i][0];
refactored_frame[1][i] = frame[i][1]; refactored_frame[1][i] = frame[i][1];
} }
for (auto i : {0, 1}) { ffmpeg.ProcessAudioFrame(refactored_frame[0], refactored_frame[1]);
audio_buffers[i].insert(audio_buffers[i].end(), refactored_frame[i].begin(),
refactored_frame[i].end());
}
CheckAudioBuffer();
} }
void FFmpegBackend::AddAudioSample(const std::array<s16, 2>& sample) { void FFmpegBackend::AddAudioSample(const std::array<s16, 2>& sample) {
for (auto i : {0, 1}) { ffmpeg.ProcessAudioFrame({sample[0]}, {sample[1]});
audio_buffers[i].push_back(sample[i]);
}
CheckAudioBuffer();
} }
void FFmpegBackend::StopDumping() { void FFmpegBackend::StopDumping() {
@ -525,12 +568,6 @@ void FFmpegBackend::StopDumping() {
// Flush the video processing queue // Flush the video processing queue
AddVideoFrame(VideoFrame()); AddVideoFrame(VideoFrame());
for (auto i : {0, 1}) { for (auto i : {0, 1}) {
// Add remaining data to audio queue
if (audio_buffers[i].size() >= 0) {
VariableAudioFrame buffer(audio_buffers[i].begin(), audio_buffers[i].end());
audio_frame_queues[i].Push(std::move(buffer));
audio_buffers[i].clear();
}
// Flush the audio processing queue // Flush the audio processing queue
audio_frame_queues[i].Push(VariableAudioFrame()); audio_frame_queues[i].Push(VariableAudioFrame());
} }
@ -554,18 +591,4 @@ void FFmpegBackend::EndDumping() {
processing_ended.Set(); processing_ended.Set();
} }
void FFmpegBackend::CheckAudioBuffer() {
for (auto i : {0, 1}) {
const std::size_t frame_size = ffmpeg.GetAudioFrameSize();
// Add audio data to the queue when there is enough to form a frame
while (audio_buffers[i].size() >= frame_size) {
VariableAudioFrame buffer(audio_buffers[i].begin(),
audio_buffers[i].begin() + frame_size);
audio_frame_queues[i].Push(std::move(buffer));
audio_buffers[i].erase(audio_buffers[i].begin(), audio_buffers[i].begin() + frame_size);
}
}
}
} // namespace VideoDumper } // namespace VideoDumper

View file

@ -96,6 +96,7 @@ private:
/** /**
* A FFmpegStream used for audio data. * A FFmpegStream used for audio data.
* Resamples (converts), encodes and writes a frame. * Resamples (converts), encodes and writes a frame.
* This also temporarily stores resampled audio data before there is enough to form a frame.
*/ */
class FFmpegAudioStream : public FFmpegStream { class FFmpegAudioStream : public FFmpegStream {
public: public:
@ -103,8 +104,8 @@ public:
bool Init(AVFormatContext* format_context); bool Init(AVFormatContext* format_context);
void Free(); void Free();
void ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1); void ProcessFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
std::size_t GetAudioFrameSize() const; void Flush();
private: private:
struct SwrContextDeleter { struct SwrContextDeleter {
@ -113,12 +114,14 @@ private:
} }
}; };
u64 sample_count{}; u64 frame_size{};
u64 frame_count{};
std::unique_ptr<AVFrame, AVFrameDeleter> audio_frame{}; std::unique_ptr<AVFrame, AVFrameDeleter> audio_frame{};
std::unique_ptr<SwrContext, SwrContextDeleter> swr_context{}; std::unique_ptr<SwrContext, SwrContextDeleter> swr_context{};
u8** resampled_data{}; u8** resampled_data{};
u64 offset{}; // Number of output samples that are currently in resampled_data.
}; };
/** /**
@ -132,10 +135,9 @@ public:
bool Init(const std::string& path, const Layout::FramebufferLayout& layout); bool Init(const std::string& path, const Layout::FramebufferLayout& layout);
void Free(); void Free();
void ProcessVideoFrame(VideoFrame& frame); void ProcessVideoFrame(VideoFrame& frame);
void ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1); void ProcessAudioFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
void FlushVideo(); void FlushVideo();
void FlushAudio(); void FlushAudio();
std::size_t GetAudioFrameSize() const;
void WriteTrailer(); void WriteTrailer();
private: private:
@ -153,8 +155,7 @@ private:
/** /**
* FFmpeg video dumping backend. * FFmpeg video dumping backend.
* This class implements a double buffer, and an audio queue to keep audio data * This class implements a double buffer.
* before enough data is received to form a frame.
*/ */
class FFmpegBackend : public Backend { class FFmpegBackend : public Backend {
public: public:
@ -169,7 +170,6 @@ public:
Layout::FramebufferLayout GetLayout() const override; Layout::FramebufferLayout GetLayout() const override;
private: private:
void CheckAudioBuffer();
void EndDumping(); void EndDumping();
std::atomic_bool is_dumping = false; ///< Whether the backend is currently dumping std::atomic_bool is_dumping = false; ///< Whether the backend is currently dumping
@ -182,9 +182,6 @@ private:
Common::Event event1, event2; Common::Event event1, event2;
std::thread video_processing_thread; std::thread video_processing_thread;
/// An audio buffer used to temporarily hold audio data, before the size is big enough
/// to be sent to the encoder as a frame
std::array<VariableAudioFrame, 2> audio_buffers;
std::array<Common::SPSCQueue<VariableAudioFrame>, 2> audio_frame_queues; std::array<Common::SPSCQueue<VariableAudioFrame>, 2> audio_frame_queues;
std::thread audio_processing_thread; std::thread audio_processing_thread;