Decoding an audio file into raw data (decode, resample, encode) using the FFmpeg API

My task is to open an existing audio file with the mka extension (Matroska container) and extract the audio data. The audio data can be of various types: PCM_ALAW, PCM_MULAW, S16LE, S16BE, GSM, OPUS and so on. The logic for converting audio from one format to another (for example, from PCM_ALAW to PCM_MULAW) must also be implemented. The main problem I cannot solve is the so-called resampling, so to find a solution I will try to describe the logic of the project in detail.

One of the most important parts of the decoder is the conversion of the extracted data from one format to another. To do this, we need to go through a few steps:

  1. Opening the input file
bool AudioDecoder::openInputStream(void)
{
    qint32 errorCode = -1;
    if (p_frmCtx == nullptr) {
        // Open the input file to read from it
        if ((errorCode = avformat_open_input(&p_frmCtx,
                 m_settings.inputFile().toStdString().c_str(), nullptr, nullptr)) < 0) {
            p_frmCtx = nullptr;
            qDebug() << QString("Could not open input file: %1 (error: %2)").arg(
                            m_settings.inputFile()).arg(getMessageByErrorCode(errorCode));
            return false;
        }
        return true;
    }
    return false;
}

  2. Retrieving stream information from the input file
bool AudioDecoder::findInputStream(void)
{
    qint32 errorCode = -1;
    if (p_frmCtx != nullptr) {
        // Get information on the input file
        if ((errorCode = avformat_find_stream_info(p_frmCtx, nullptr)) < 0) {
            avformat_close_input(&p_frmCtx);
            qDebug() << QString("Could not open find stream info (error: %1)").arg(getMessageByErrorCode(errorCode));
            return false;
        }
        return true;
    }
    return false;
}
  3. Checking all streams for an audio stream
bool AudioDecoder::checkInputStream(void)
{
    if (p_frmCtx != nullptr) {
        m_streamIndex = -1;
        if (p_frmCtx->nb_streams != 1) {
            for (quint32 i = 0; i < p_frmCtx->nb_streams; ++i) {
                if (p_frmCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                    m_streamIndex = i;
                    return true;
                }
            }
        }
        if (p_frmCtx->streams[0]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            m_streamIndex = 0;
            return true;
        }
        avformat_close_input(&p_frmCtx);
    }
    return false;
}
  4. Search for a suitable codec for decoding the stream
bool AudioDecoder::openDecoderForInputStream(void)
{
    // Find a decoder for the audio stream
    AVCodec* inputCodec = nullptr;
    if (!(inputCodec = avcodec_find_decoder(
              (p_frmCtx)->streams[m_streamIndex]->codecpar->codec_id))) {
        avformat_close_input(&p_frmCtx);
        return false;
    }
    // Allocate a new decoding context.
    p_iCdcCtx = avcodec_alloc_context3(inputCodec);
    if (!p_iCdcCtx) {
        avformat_close_input(&p_frmCtx);
        return false;
    }
    // Initialize the stream parameters with demuxer information.
    qint32 error = -1;
    if ((error = avcodec_parameters_to_context(p_iCdcCtx,
             (p_frmCtx)->streams[m_streamIndex]->codecpar)) < 0) {
        avformat_close_input(&p_frmCtx);
        avcodec_free_context(&p_iCdcCtx);
        return false;
    }
    // Open the decoder for the audio stream to use it later
    if ((error = avcodec_open2(p_iCdcCtx, inputCodec, nullptr)) < 0) {
        avcodec_free_context(&p_iCdcCtx);
        avformat_close_input(&p_frmCtx);
        return false;
    }
    av_dump_format(p_frmCtx, 0, m_settings.inputFile().toStdString().c_str(), 0);
    return true;
}
  5. Setting the desired codec for encoding the decoded data
bool AudioDecoder::openEncoderForOutputStream(void)
{
    AVCodec* audio_avc = avcodec_find_encoder_by_name(
                m_settings.audioCodec().toStdString().c_str());
    if (!audio_avc) {
        return false;
    }
    p_oCdcCtx = avcodec_alloc_context3(audio_avc);
    if (!p_oCdcCtx) {
        return false;
    }
    
    p_oCdcCtx->channels              = m_settings.channelCount();
    p_oCdcCtx->channel_layout        = av_get_default_channel_layout(m_settings.channelCount());
    p_oCdcCtx->sample_rate           = m_settings.sampleRate();
    p_oCdcCtx->sample_fmt            = audio_avc->sample_fmts[0];
    p_oCdcCtx->bit_rate              = m_settings.constBitRate();
    p_oCdcCtx->time_base.num         = 1;
    p_oCdcCtx->time_base.den         = static_cast<int>(m_settings.sampleRate());
    p_oCdcCtx->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    if (avcodec_open2(p_oCdcCtx, audio_avc, nullptr) < 0) {
        return false;
    }
    return true;
}
  6. Extracting all audio data, decoding it, re-encoding it and saving it to a buffer
QByteArray AudioDecoder::getAllData(void)
{
    QByteArray arr;
    p_frame = av_frame_alloc();
    if (!p_frame) {
        return arr;
    }
    AVPacket p_packet;
    av_init_packet(&p_packet);
    int err = 0;

    while ((err = av_read_frame(p_frmCtx, &p_packet)) != AVERROR_EOF) {
        if (err != 0) {
            break;
        }

        // Does the packet belong to the correct stream ?
        if (p_packet.stream_index != m_streamIndex) {
            // Free the buffers used by the packet and reset all fields.
            av_packet_unref(&p_packet);
            continue;
        }

        int response = avcodec_send_packet(p_iCdcCtx, &p_packet);
        if (response < 0) {
            return arr;
        }

        while (response >= 0) {
            response = avcodec_receive_frame(p_iCdcCtx, p_frame);
            if (response == AVERROR(EAGAIN) || response == AVERROR_EOF) {
                break;
            }
            else if (response < 0) {
                return arr;
            }
            else {

                AVPacket* output_packet = av_packet_alloc();
                if (!output_packet) {
                    return arr;
                }

                int encResponse = avcodec_send_frame(p_oCdcCtx, p_frame);
                while (encResponse >= 0) {
                    encResponse = avcodec_receive_packet(p_oCdcCtx, output_packet);
                    if (encResponse == AVERROR(EAGAIN) || encResponse == AVERROR_EOF) {
                        break;
                    }
                    else if (encResponse < 0) {
                        return arr;
                    }
                    else {

                        av_packet_rescale_ts(output_packet, p_oCdcCtx->time_base, p_iCdcCtx->time_base);
                        output_packet->stream_index = m_streamIndex;
                        // Save decoded - encoded audio data
                        for (int i = 0; i < output_packet->size; ++i) {
                            arr.push_back(output_packet->data[i]);
                        }
                    }
                }
                av_packet_unref(output_packet);
                av_packet_free(&output_packet);
                av_frame_unref(p_frame);
            }
        }
        av_packet_unref(&p_packet);
    }
    av_packet_unref(&p_packet);
    av_frame_free(&p_frame);
    return arr;
}

These are the main functions for extracting the audio data, decoding it and re-encoding it into the format we need.

As stated earlier, my problem is that I cannot properly implement the sample-rate conversion, the change of the number of channels and the change of the sample format (resampling) before encoding the data and saving it to the buffer. I tried the logic from the given example, but I ran into a problem.

When testing, I used an audio file with stereo sound (2 channels) as the input. For the output I set mono (1 channel), but the resulting audio plays back with severe distortion. I also could not implement changing the sample rate. How do I implement this?
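My current understanding is that a SwrContext has to be created from the parameters of the already opened input and output codec contexts. Below is only a sketch of what I assume the declared initResampler() should look like (untested; it needs #include <libswresample/swresample.h>):

bool AudioDecoder::initResampler(void)
{
    // Create the resampler from the already opened codec contexts:
    // the input side uses the decoder parameters, the output side the encoder parameters.
    swr_ctx = swr_alloc_set_opts(
        nullptr,
        av_get_default_channel_layout(p_oCdcCtx->channels), // output channel layout
        p_oCdcCtx->sample_fmt,                               // output sample format
        p_oCdcCtx->sample_rate,                              // output sample rate
        av_get_default_channel_layout(p_iCdcCtx->channels), // input channel layout
        p_iCdcCtx->sample_fmt,                               // input sample format
        p_iCdcCtx->sample_rate,                              // input sample rate
        0, nullptr);
    if (!swr_ctx) {
        return false;
    }
    // The context must be initialized after all options have been set.
    return swr_init(swr_ctx) >= 0;
}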

I only know that resampling has to be applied before the data is passed to the encoder, and I think this example already has the functionality needed to implement it.
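If that is correct, then inside getAllData() every decoded frame would have to be converted before avcodec_send_frame(), roughly like this (again only a sketch: the output buffer sizing via swr_get_delay()/av_rescale_rnd() is my assumption, and encoders with a fixed frame_size would additionally need an AVAudioFifo to regroup the converted samples):

// Sketch: convert the decoded frame (p_frame) into a frame that matches the
// encoder (p_oCdcCtx) and send that one to avcodec_send_frame() instead.
AVFrame* out = av_frame_alloc();
if (!out) {
    return arr;
}
out->channel_layout = p_oCdcCtx->channel_layout;
out->channels       = p_oCdcCtx->channels;
out->format         = p_oCdcCtx->sample_fmt;
out->sample_rate    = p_oCdcCtx->sample_rate;
// Worst-case number of output samples for this input frame.
out->nb_samples = av_rescale_rnd(
    swr_get_delay(swr_ctx, p_iCdcCtx->sample_rate) + p_frame->nb_samples,
    p_oCdcCtx->sample_rate, p_iCdcCtx->sample_rate, AV_ROUND_UP);
if (av_frame_get_buffer(out, 0) < 0) {
    av_frame_free(&out);
    return arr;
}
// The actual conversion: channel count, sample format and sample rate.
int converted = swr_convert(swr_ctx,
                            out->data, out->nb_samples,
                            (const uint8_t**)p_frame->data, p_frame->nb_samples);
if (converted < 0) {
    av_frame_free(&out);
    return arr;
}
out->nb_samples = converted;

// ... send 'out' (instead of p_frame) to the encoder and receive the packets
// exactly as in the existing loop, then:
av_frame_free(&out);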

I attach the full project code:

P.S. The AudioDecoderSettings class is not shown, as it is a simple class that just holds the decoder settings.

Also, it is worth noting that this is just a test version, and most likely there are memory leaks and other problems.

Main.cpp

int main(int argc, char *argv[])
{
    AudioDecoderSettings settings;
    // Input file
    settings.setInputFile("/home/test/ulaw-8000-64kbs-ch2.mka"); 
    // The sample rate I want at the output
    settings.setSampleRate(8000);
    // The number of channels I want at the output
    settings.setChannelCount(1); 
    // The audio format I want at the output
    settings.setAudioCodec("pcm_alaw"); 
    // The bit rate I want at the output
    settings.setConstBitRate(64000); 

    AudioDecoder decoder(settings);

    // Initialize all the library dependencies and
    // open the input file to extract the audio data.
    if (!decoder.init()) {
        return EXIT_FAILURE;
    }

    // The file in which the extracted audio data will be saved.
    QFile file("/home/test/pcm_alaw.bin");
    file.open(QIODevice::WriteOnly);
    auto res = decoder.getAllData();
    file.write(res.data(), res.size());
    file.close();
    return EXIT_SUCCESS;
}
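
(For reference: the resulting raw A-law data can be played back for checking with, e.g., ffplay -f alaw -ar 8000 -ac 1 /home/test/pcm_alaw.bin.)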

Audiodecoder.h

class AudioDecoder {
public:
    AudioDecoder(const AudioDecoderSettings& settings);
    AudioDecoder& operator=(const AudioDecoder& other) = delete;
    AudioDecoder& operator=(AudioDecoder&& other) = delete;
    AudioDecoder(const AudioDecoder& other) = delete;
    AudioDecoder(AudioDecoder&& other) = delete;
    ~AudioDecoder(void);

    bool init(void);
    QByteArray getAllData(void);
    bool term(void);

protected:
    QString getMessageByErrorCode(const qint32& code);
    bool openInputStream(void);
    bool checkInputStream(void);
    bool findInputStream(void);
    bool initResampler(void);
    bool openDecoderForInputStream(void);
    bool openEncoderForOutputStream(void);

protected:
    int m_streamIndex{ 0 };
    bool m_initialized{ false };
    AVFrame* p_frame = nullptr;
    AVFormatContext* p_frmCtx = nullptr;
    AVCodecContext* p_iCdcCtx = nullptr;
    AVCodecContext* p_oCdcCtx = nullptr;
    AudioDecoderSettings m_settings;
    struct SwrContext* swr_ctx = nullptr;
};

Audiodecoder.cpp

void printStreamInformation(const AVCodecContext* codecCtx, int audioStreamIndex)
{
    fprintf(stderr, "Codec: %s\n", codecCtx->codec->long_name);
    if (codecCtx->codec->sample_fmts != NULL) {
        fprintf(stderr, "Supported sample formats: ");
        for (int i = 0; codecCtx->codec->sample_fmts[i] != -1; ++i) {
            fprintf(stderr, "%s", av_get_sample_fmt_name(codecCtx->codec->sample_fmts[i]));
            if (codecCtx->codec->sample_fmts[i + 1] != -1) {
                fprintf(stderr, ", ");
            }
        }
        fprintf(stderr, "\n");
    }
    fprintf(stderr, "---------\n");
    fprintf(stderr, "Stream:        %7d\n", audioStreamIndex);
    fprintf(stderr, "Sample Format: %7s\n", av_get_sample_fmt_name(codecCtx->sample_fmt));
    fprintf(stderr, "Sample Rate:   %7d\n", codecCtx->sample_rate);
    fprintf(stderr, "Sample Size:   %7d\n", av_get_bytes_per_sample(codecCtx->sample_fmt));
    fprintf(stderr, "Channels:      %7d\n", codecCtx->channels);
    fprintf(stderr, "Planar:        %7d\n", av_sample_fmt_is_planar(codecCtx->sample_fmt));
}

bool AudioDecoder::openInputStream(void)
{
    qint32 errorCode = -1;
    if (p_frmCtx == nullptr) {
        // Open the input file to read from it
        if ((errorCode = avformat_open_input(&p_frmCtx,
                 m_settings.inputFile().toStdString().c_str(), nullptr, nullptr)) < 0) {
            p_frmCtx = nullptr;
            qDebug() << QString("Could not open input file: %1 (error: %2)").arg(
                            m_settings.inputFile()).arg(getMessageByErrorCode(errorCode));
            return false;
        }
        return true;
    }
    return false;
}

bool AudioDecoder::findInputStream(void)
{
    qint32 errorCode = -1;
    if (p_frmCtx != nullptr) {
        // Get information on the input file
        if ((errorCode = avformat_find_stream_info(p_frmCtx, nullptr)) < 0) {
            avformat_close_input(&p_frmCtx);
            qDebug() << QString("Could not open find stream info (error: %1)").arg(getMessageByErrorCode(errorCode));
            return false;
        }
        return true;
    }
    return false;
}

bool AudioDecoder::checkInputStream(void)
{
    if (p_frmCtx != nullptr) {
        m_streamIndex = -1;
        if (p_frmCtx->nb_streams != 1) {
            for (quint32 i = 0; i < p_frmCtx->nb_streams; ++i) {
                if (p_frmCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                    m_streamIndex = i;
                    return true;
                }
            }
        }
        if (p_frmCtx->streams[0]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            m_streamIndex = 0;
            return true;
        }
        avformat_close_input(&p_frmCtx);
    }
    return false;
}

bool AudioDecoder::openDecoderForInputStream(void)
{
    // Find a decoder for the audio stream
    AVCodec* inputCodec = nullptr;
    if (!(inputCodec = avcodec_find_decoder(
              (p_frmCtx)->streams[m_streamIndex]->codecpar->codec_id))) {
        avformat_close_input(&p_frmCtx);
        return false;
    }
    // Allocate a new decoding context.
    p_iCdcCtx = avcodec_alloc_context3(inputCodec);
    if (!p_iCdcCtx) {
        avformat_close_input(&p_frmCtx);
        return false;
    }
    // Initialize the stream parameters with demuxer information.
    qint32 error = -1;
    if ((error = avcodec_parameters_to_context(p_iCdcCtx,
             (p_frmCtx)->streams[m_streamIndex]->codecpar)) < 0) {
        avformat_close_input(&p_frmCtx);
        avcodec_free_context(&p_iCdcCtx);
        return false;
    }
    // Open the decoder for the audio stream to use it later
    if ((error = avcodec_open2(p_iCdcCtx, inputCodec, nullptr)) < 0) {
        avcodec_free_context(&p_iCdcCtx);
        avformat_close_input(&p_frmCtx);
        return false;
    }
    av_dump_format(p_frmCtx, 0, m_settings.inputFile().toStdString().c_str(), 0);
    return true;
}

bool AudioDecoder::init(void)
{
    if (m_initialized) {
        qDebug() << "The decoder is already initialized";
        return true;
    }
    if (!openInputStream()) {
        return false;
    }
    if (!findInputStream()) {
        return false;
    }
    if (!checkInputStream()) {
        return false;
    }
    if (!openDecoderForInputStream()) {
        return false;
    }
    if (!openEncoderForOutputStream()) {
        return false;
    }
    if (!initResampler()) {
        return false;
    }

    printStreamInformation(p_iCdcCtx, m_streamIndex);

    return m_initialized = true;
}

QString AudioDecoder::getMessageByErrorCode(const qint32& code)
{
    if (code < 0) {
        char errorBuffer[255]{ 0 };
        av_strerror(code, errorBuffer, sizeof(errorBuffer));
        return QString(errorBuffer);
    }
    return QString();
}

AudioDecoder::AudioDecoder(const AudioDecoderSettings& settings)
    : m_streamIndex(0)
    , m_initialized(false)
    , p_frame(nullptr)
    , p_frmCtx(nullptr)
    , p_iCdcCtx(nullptr)
    , p_oCdcCtx(nullptr)
    , m_settings(settings)
{
    av_register_all();
    avcodec_register_all();
}

AudioDecoder::~AudioDecoder(void)
{
    term();
}

bool AudioDecoder::term(void)
{
    if (!m_initialized) {
        return false;
    }
    if (swr_ctx) {
        swr_free(&swr_ctx);
    }
    if (p_iCdcCtx != nullptr) {
        avcodec_free_context(&p_iCdcCtx);
    }
    if (p_oCdcCtx != nullptr) {
        avcodec_free_context(&p_oCdcCtx);
    }
    if (p_frmCtx != nullptr) {
        avformat_close_input(&p_frmCtx);
    }
    m_initialized = false;
    return true;
}

bool AudioDecoder::openEncoderForOutputStream(void)
{
    AVCodec* audio_avc = avcodec_find_encoder_by_name(
                m_settings.audioCodec().toStdString().c_str());
    if (!audio_avc) {
        return false;
    }
    p_oCdcCtx = avcodec_alloc_context3(audio_avc);
    if (!p_oCdcCtx) {
        return false;
    }

    p_oCdcCtx->channels              = m_settings.channelCount();
    p_oCdcCtx->channel_layout        = av_get_default_channel_layout(m_settings.channelCount());
    p_oCdcCtx->sample_rate           = m_settings.sampleRate();
    p_oCdcCtx->sample_fmt            = audio_avc->sample_fmts[0];
    p_oCdcCtx->bit_rate              = m_settings.constBitRate();
    p_oCdcCtx->time_base.num         = 1;
    p_oCdcCtx->time_base.den         = static_cast<int>(m_settings.sampleRate());
    p_oCdcCtx->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    if (avcodec_open2(p_oCdcCtx, audio_avc, nullptr) < 0) {
        return false;
    }
    return true;
}

QByteArray AudioDecoder::getAllData(void)
{
    QByteArray arr;
    p_frame = av_frame_alloc();
    if (!p_frame) {
        return arr;
    }
    AVPacket p_packet;
    av_init_packet(&p_packet);
    int err = 0;

    while ((err = av_read_frame(p_frmCtx, &p_packet)) != AVERROR_EOF) {
        if (err != 0) {
            break;
        }

        // Does the packet belong to the correct stream ?
        if (p_packet.stream_index != m_streamIndex) {
            // Free the buffers used by the packet and reset all fields.
            av_packet_unref(&p_packet);
            continue;
        }

        int response = avcodec_send_packet(p_iCdcCtx, &p_packet);
        if (response < 0) {
            return arr;
        }

        while (response >= 0) {
            response = avcodec_receive_frame(p_iCdcCtx, p_frame);
            if (response == AVERROR(EAGAIN) || response == AVERROR_EOF) {
                break;
            }
            else if (response < 0) {
                return arr;
            }
            else {

                AVPacket* output_packet = av_packet_alloc();
                if (!output_packet) {
                    return arr;
                }

                int encResponse = avcodec_send_frame(p_oCdcCtx, p_frame);
                while (encResponse >= 0) {
                    encResponse = avcodec_receive_packet(p_oCdcCtx, output_packet);
                    if (encResponse == AVERROR(EAGAIN) || encResponse == AVERROR_EOF) {
                        break;
                    }
                    else if (encResponse < 0) {
                        return arr;
                    }
                    else {

                        av_packet_rescale_ts(output_packet, p_oCdcCtx->time_base, p_iCdcCtx->time_base);
                        output_packet->stream_index = m_streamIndex;
                        // Save decoded - encoded audio data
                        for (int i = 0; i < output_packet->size; ++i) {
                            arr.push_back(output_packet->data[i]);
                        }
                    }
                }
                av_packet_unref(output_packet);
                av_packet_free(&output_packet);
                av_frame_unref(p_frame);
            }
        }
        av_packet_unref(&p_packet);
    }
    av_packet_unref(&p_packet);
    av_frame_free(&p_frame);
    return arr;
}
Author: bbdd, 2020-09-17