ScreenCapture: Single Header DirectX Library with HDR Support
DirectX hardware screen capture and encoding with audio mixing capabilities. H264/H265/VP80/VP90/FLAC/MP3. HDR supported.
Introduction
There is lots of code out there about screen capture. Here is a simple, single-header, hardware-accelerated library. If you are using Windows 8 or later, you can easily include it in your projects.
Requirements
- Windows 8 or later
Video Capture
We need to enumerate our adapters and the monitors attached to them, with the aid of DXGI:
static void GetAdapters(std::vector<CComPtr<IDXGIAdapter1>>& a)
{
    CComPtr<IDXGIFactory1> df;
    CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&df);
    a.clear();
    if (!df)
        return;
    int L = 0;
    for (;;)
    {
        // EnumAdapters1 leaves the pointer null past the last adapter
        CComPtr<IDXGIAdapter1> lDxgiAdapter;
        df->EnumAdapters1(L, &lDxgiAdapter);
        if (!lDxgiAdapter)
            break;
        L++;
        a.push_back(lDxgiAdapter);
    }
}
Then we instantiate a Direct3D 11 device on one of them, or on the default:
HRESULT CreateDirect3DDevice(IDXGIAdapter1* g)
{
    HRESULT hr = S_OK;

    // Driver types supported
    D3D_DRIVER_TYPE DriverTypes[] =
    {
        D3D_DRIVER_TYPE_HARDWARE,
        D3D_DRIVER_TYPE_WARP,
        D3D_DRIVER_TYPE_REFERENCE,
    };
    UINT NumDriverTypes = ARRAYSIZE(DriverTypes);

    // Feature levels supported
    D3D_FEATURE_LEVEL FeatureLevels[] =
    {
        D3D_FEATURE_LEVEL_11_0,
        D3D_FEATURE_LEVEL_10_1,
        D3D_FEATURE_LEVEL_10_0,
        D3D_FEATURE_LEVEL_9_3,
        D3D_FEATURE_LEVEL_9_2,
        D3D_FEATURE_LEVEL_9_1
    };
    UINT NumFeatureLevels = ARRAYSIZE(FeatureLevels);
    D3D_FEATURE_LEVEL FeatureLevel;

    // Create device. Note: when an explicit adapter is passed,
    // D3D11CreateDevice requires D3D_DRIVER_TYPE_UNKNOWN.
    for (UINT DriverTypeIndex = 0; DriverTypeIndex < NumDriverTypes; ++DriverTypeIndex)
    {
        hr = D3D11CreateDevice(g,
            g ? D3D_DRIVER_TYPE_UNKNOWN : DriverTypes[DriverTypeIndex],
            nullptr, D3D11_CREATE_DEVICE_VIDEO_SUPPORT, FeatureLevels, NumFeatureLevels,
            D3D11_SDK_VERSION, &device, &FeatureLevel, &context);
        if (SUCCEEDED(hr))
        {
            // Device creation succeeded, no need to loop anymore
            break;
        }
    }
    if (FAILED(hr))
        return hr;
    return S_OK;
}
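For instance, here is a minimal sketch that picks an adapter from GetAdapters() (or falls back to the default) and creates the device with it; the selection logic is illustrative only:

// Enumerate adapters and create the device on a chosen one (or the default).
std::vector<CComPtr<IDXGIAdapter1>> adapters;
GetAdapters(adapters);

IDXGIAdapter1* pick = nullptr;   // nullptr -> let D3D pick the default
if (adapters.size() > 1)
    pick = adapters[1];          // e.g., prefer a second GPU if present

if (FAILED(CreateDirect3DDevice(pick)))
    return;                      // no usable device on this system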
Next, we create the desktop duplication for the chosen output:
bool Prepare(UINT Output = 0)
{
    // Get DXGI device
    CComPtr<IDXGIDevice> lDxgiDevice;
    lDxgiDevice = device;
    if (!lDxgiDevice)
        return 0;

    // Get DXGI adapter
    CComPtr<IDXGIAdapter> lDxgiAdapter;
    auto hr = lDxgiDevice->GetParent(
        __uuidof(IDXGIAdapter),
        reinterpret_cast<void**>(&lDxgiAdapter));
    if (FAILED(hr))
        return 0;
    lDxgiDevice = 0;

    // Get output
    CComPtr<IDXGIOutput> lDxgiOutput;
    hr = lDxgiAdapter->EnumOutputs(Output, &lDxgiOutput);
    if (FAILED(hr))
        return 0;
    lDxgiAdapter = 0;
    DXGI_OUTPUT_DESC lOutputDesc;
    hr = lDxgiOutput->GetDesc(&lOutputDesc);

    // QI for IDXGIOutput1
    CComPtr<IDXGIOutput1> lDxgiOutput1;
    lDxgiOutput1 = lDxgiOutput;
    if (!lDxgiOutput1)
        return 0;
    lDxgiOutput = 0;

    // Create desktop duplication
    hr = lDxgiOutput1->DuplicateOutput(
        device,
        &lDeskDupl);
    if (FAILED(hr))
        return 0;
    lDxgiOutput1 = 0;

    // Create GDI drawing texture
    lDeskDupl->GetDesc(&lOutputDuplDesc);
    D3D11_TEXTURE2D_DESC desc = {};
    desc.Width = lOutputDuplDesc.ModeDesc.Width;
    desc.Height = lOutputDuplDesc.ModeDesc.Height;
    desc.Format = lOutputDuplDesc.ModeDesc.Format;
    desc.ArraySize = 1;
    desc.BindFlags = D3D11_BIND_FLAG::D3D11_BIND_RENDER_TARGET;
    desc.MiscFlags = D3D11_RESOURCE_MISC_GDI_COMPATIBLE;
    desc.SampleDesc.Count = 1;
    desc.SampleDesc.Quality = 0;
    desc.MipLevels = 1;
    desc.CPUAccessFlags = 0;
    desc.Usage = D3D11_USAGE_DEFAULT;
    hr = device->CreateTexture2D(&desc, NULL, &lGDIImage);
    if (FAILED(hr))
        return 0;
    if (lGDIImage == nullptr)
        return 0;

    // Create CPU access (staging) texture
    desc.Width = lOutputDuplDesc.ModeDesc.Width;
    desc.Height = lOutputDuplDesc.ModeDesc.Height;
    desc.Format = lOutputDuplDesc.ModeDesc.Format;
    desc.ArraySize = 1;
    desc.BindFlags = 0;
    desc.MiscFlags = 0;
    desc.SampleDesc.Count = 1;
    desc.SampleDesc.Quality = 0;
    desc.MipLevels = 1;
    desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
    desc.Usage = D3D11_USAGE_STAGING;
    hr = device->CreateTexture2D(&desc, NULL, &lDestImage);
    if (FAILED(hr))
        return 0;
    if (lDestImage == nullptr)
        return 0;
    return 1;
}
To get the screenshot, we loop:
hr = cap.lDeskDupl->AcquireNextFrame(
    0,
    &lFrameInfo,
    &lDesktopResource);
if (hr == DXGI_ERROR_WAIT_TIMEOUT)
    hr = S_OK;
if (FAILED(hr))
    break;
if (lDesktopResource && !cap.Get(lDesktopResource, dp.Cursor,
    dp.rx.right && dp.rx.bottom ? &dp.rx : 0))
    break;
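For context, here is a hedged sketch of a full loop around that snippet. Note that each successful AcquireNextFrame must be matched by a ReleaseFrame before the next call, otherwise DXGI returns DXGI_ERROR_INVALID_CALL:

// Illustrative frame loop: acquire, copy, release, repeat.
for (;;)
{
    DXGI_OUTDUPL_FRAME_INFO lFrameInfo = {};
    CComPtr<IDXGIResource> lDesktopResource;
    auto hr = cap.lDeskDupl->AcquireNextFrame(0, &lFrameInfo, &lDesktopResource);
    if (hr == DXGI_ERROR_WAIT_TIMEOUT)
        continue;                       // nothing changed on screen yet
    if (FAILED(hr))
        break;                          // e.g., DXGI_ERROR_ACCESS_LOST

    bool ok = cap.Get(lDesktopResource, dp.Cursor,
                      dp.rx.right && dp.rx.bottom ? &dp.rx : 0);
    lDesktopResource = nullptr;
    cap.lDeskDupl->ReleaseFrame();      // required before the next Acquire
    if (!ok)
        break;
    // ... hand cap.buf to the sink writer here ...
}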
The Get() method returns the bitmap, optionally with the cursor drawn into it, and cropped if a rectangle is given:
bool Get(IDXGIResource* lDesktopResource, bool Curs, RECT* rcx = 0)
{
    // QI for ID3D11Texture2D
    CComPtr<ID3D11Texture2D> lAcquiredDesktopImage;
    if (!lDesktopResource)
        return 0;
    auto hr = lDesktopResource->QueryInterface(IID_PPV_ARGS(&lAcquiredDesktopImage));
    if (!lAcquiredDesktopImage)
        return 0;
    lDesktopResource = 0;

    // Copy image into GDI drawing texture
    context->CopyResource(lGDIImage, lAcquiredDesktopImage);

    // Draw cursor image into GDI drawing texture
    CComPtr<IDXGISurface1> lIDXGISurface1;
    lIDXGISurface1 = lGDIImage;
    if (!lIDXGISurface1)
        return 0;
    CURSORINFO lCursorInfo = { 0 };
    lCursorInfo.cbSize = sizeof(lCursorInfo);
    auto lBoolres = GetCursorInfo(&lCursorInfo);
    if (lBoolres == TRUE)
    {
        if (lCursorInfo.flags == CURSOR_SHOWING && Curs)
        {
            auto lCursorPosition = lCursorInfo.ptScreenPos;
            HDC lHDC;
            lIDXGISurface1->GetDC(FALSE, &lHDC);
            DrawIconEx(
                lHDC,
                lCursorPosition.x,
                lCursorPosition.y,
                lCursorInfo.hCursor,
                0,
                0,
                0,
                0,
                DI_NORMAL | DI_DEFAULTSIZE);
            lIDXGISurface1->ReleaseDC(nullptr);
        }
    }

    // Copy image into CPU access texture
    context->CopyResource(lDestImage, lGDIImage);

    // Copy from CPU access texture to bitmap buffer
    D3D11_MAPPED_SUBRESOURCE resource;
    UINT subresource = D3D11CalcSubresource(0, 0, 0);
    hr = context->Map(lDestImage, subresource, D3D11_MAP_READ_WRITE, 0, &resource);
    if (FAILED(hr))
        return 0;

    auto sz = lOutputDuplDesc.ModeDesc.Width
        * lOutputDuplDesc.ModeDesc.Height * 4;
    auto sz2 = sz;
    buf.resize(sz);
    if (rcx)
    {
        sz2 = (rcx->right - rcx->left) * (rcx->bottom - rcx->top) * 4;
        buf.resize(sz2);
        sz = sz2;
    }
    UINT lBmpRowPitch = lOutputDuplDesc.ModeDesc.Width * 4;
    if (rcx)
        lBmpRowPitch = (rcx->right - rcx->left) * 4;
    UINT lRowPitch = std::min<UINT>(lBmpRowPitch, resource.RowPitch);

    // Copy row by row, bottom-up, so the buffer matches bitmap layout
    BYTE* sptr = reinterpret_cast<BYTE*>(resource.pData);
    BYTE* dptr = buf.data() + sz - lBmpRowPitch;
    if (rcx)
        sptr += rcx->left * 4;
    for (size_t h = 0; h < lOutputDuplDesc.ModeDesc.Height; ++h)
    {
        if (rcx && h < (size_t)rcx->top)
        {
            sptr += resource.RowPitch;
            continue;
        }
        if (rcx && h >= (size_t)rcx->bottom)
            break;
        memcpy_s(dptr, lBmpRowPitch, sptr, lRowPitch);
        sptr += resource.RowPitch;
        dptr -= lBmpRowPitch;
    }
    context->Unmap(lDestImage, subresource);
    return 1;
}
After that, you can feed the "buf" data into Media Foundation's sink writer.
Audio Capture
You use an IAudioClient to obtain an IAudioCaptureClient and record audio in a separate thread.
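Setting this up for loopback capture of the default playback device might look like the sketch below (standard Core Audio calls, error handling omitted for brevity); the library's actual device selection goes through AudioFrom, described later:

// Open the default render endpoint in loopback mode and get a capture client.
CComPtr<IMMDeviceEnumerator> enumr;
CoCreateInstance(__uuidof(MMDeviceEnumerator), 0, CLSCTX_ALL,
                 __uuidof(IMMDeviceEnumerator), (void**)&enumr);

CComPtr<IMMDevice> dev;
enumr->GetDefaultAudioEndpoint(eRender, eConsole, &dev);

CComPtr<IAudioClient> ac;
dev->Activate(__uuidof(IAudioClient), CLSCTX_ALL, 0, (void**)&ac);

WAVEFORMATEX* pwfx = nullptr;
ac->GetMixFormat(&pwfx);

// AUDCLNT_STREAMFLAGS_LOOPBACK turns a playback endpoint into a capture source.
ac->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK,
               10000000 /* 1 s buffer */, 0, pwfx, 0);

CComPtr<IAudioCaptureClient> cap;
ac->GetService(__uuidof(IAudioCaptureClient), (void**)&cap);
CoTaskMemFree(pwfx);
ac->Start();

The capture thread then drains the buffers: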
void ThreadLoopCapture()
{
    UINT64 up, uq;
    while (Capturing)
    {
        if (hEv)
            WaitForSingleObject(hEv, INFINITE);
        if (!Capturing)
            break;
        auto hr = cap->GetBuffer(&pData, &framesAvailable, &flags, &up, &uq);
        if (FAILED(hr))
            break;
        if (framesAvailable == 0)
            continue;
        auto ThisAudioBytes = framesAvailable * wfx.Format.nChannels *
            wfx.Format.wBitsPerSample / 8;
        AudioDataX->PushX((const char*)pData, ThisAudioBytes);
        cap->ReleaseBuffer(framesAvailable);
    }
    CapturingFin1 = true;
}
If the recording device is a playback device captured through loopback, you have to ensure that something is playing, otherwise the Core Audio API captures nothing. So we play silence:
void PlaySilence(REFERENCE_TIME rt)
{
    // rt is in 100-ns units; convert to milliseconds
    rt /= 10000;

    // Frames needed to cover rt ms at the current sample rate
    auto ns = (wfx.Format.nSamplesPerSec * rt);
    ns /= 1000;

    while (Capturing)
    {
        if (!ren)
            break;
        Sleep((DWORD)(rt / 2));
        if (!Capturing)
            break;

        // See how much buffer space is available
        UINT32 numFramesPadding = 0;
        auto hr = ac2->GetCurrentPadding(&numFramesPadding);
        if (FAILED(hr))
            break;
        auto numFramesAvailable = ns - numFramesPadding;
        if (!numFramesAvailable)
            continue;

        BYTE* db = 0;
        hr = ren->GetBuffer((UINT32)numFramesAvailable, &db);
        if (FAILED(hr))
            break;
        auto bs = numFramesAvailable * wfx.Format.nChannels * wfx.Format.wBitsPerSample / 8;
        memset(db, 0, (size_t)bs);
        ren->ReleaseBuffer((UINT32)numFramesAvailable, 0); // or AUDCLNT_BUFFERFLAGS_SILENT
    }
    CapturingFin2 = true;
}
When there are multiple audio streams, you have to mix them into a single buffer. This is done using my own REBUFFER and MIXBUFFER structures:
struct REBUFFER
{
    std::recursive_mutex m;
    std::vector<char> d;
    AHANDLE Has = CreateEvent(0, TRUE, 0, 0);
    MIXBUFFER<float> mb;

    void FinMix(size_t sz, float* A = 0)
    {
        mb.Fin(sz / sizeof(float), A);
    }

    size_t PushX(const char* dd, size_t sz, float* A = 0, float V = 1.0f)
    {
        REBUFFERLOCK l(m);
        auto s = d.size();
        d.resize(s + sz);
        if (dd)
            memcpy(d.data() + s, dd, sz);
        else
            memset(d.data() + s, 0, sz);
        char* a1 = d.data();
        a1 += s;
        mb.Set((float*)a1);
        mb.count = 1;
        SetEvent(Has);
        float* b = (float*)(d.data() + s);
        if (V > 1.01f || V < 0.99f)
        {
            auto st = sz / sizeof(float);
            for (size_t i = 0; i < st; i++)
                b[i] *= V;
        }
        if (A)
        {
            *A = Peak<float>(b, sz / sizeof(float));
        }
        return s + sz;
    }

    size_t Av()
    {
        REBUFFERLOCK l(m);
        return d.size();
    }

    size_t PopX(char* trg, size_t sz, DWORD wi = 0, bool NR = false)
    {
        if (wi)
            WaitForSingleObject(Has, wi);
        REBUFFERLOCK l(m);
        if (sz >= d.size())
            sz = d.size();
        if (sz == 0)
            return 0;
        if (trg)
            memcpy(trg, d.data(), sz);
        if (NR == false)
            d.erase(d.begin(), d.begin() + sz);
        if (d.size() == 0)
            ResetEvent(Has);
        return sz;
    }

    void Clear()
    {
        REBUFFERLOCK l(m);
        d.clear();
    }
};
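As a usage sketch (hypothetical buffer sizes; MIXBUFFER's internals are not shown in this article):

// Producers push samples, a consumer pops the accumulated bytes.
REBUFFER AudioData;

// Capture thread: append interleaved float samples at 80% volume.
// (pData/ThisAudioBytes as in ThreadLoopCapture above.)
AudioData.PushX((const char*)pData, ThisAudioBytes, nullptr, 0.8f);

// Writer thread: wait up to 100 ms for data, then drain what is available.
std::vector<char> chunk(16384);
auto got = AudioData.PopX(chunk.data(), chunk.size(), 100);
if (got)
{
    // ... feed 'got' bytes to the audio stream of the sink writer ...
}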
If you have audio, video is synchronized to it.
HDR Support
When your display is HDR, the following happens:
- lDeskDupl->GetDesc(&lOutputDuplDesc); returns a description with a format of DXGI_FORMAT_R16G16B16A16_FLOAT, which is not GDI compatible.
- The cursor can't be drawn, so the Cursor parameter is ignored.
- Media Foundation can't create an HDR video. Therefore, you have to install Turbo Play and use my own Media Foundation library, which can use the NVIDIA encoder to create true HDR-10 videos. If you install Turbo Play, there is a file named nvh64.dll in the installation directory. Run regsvr32 on it as administrator and my filter will be registered for you to use with the Screen Capture.
Using the Library
#include "stdafx.h"
#include "capture.hpp"
#include <iostream>
int wmain()
{
CoInitializeEx(0, COINIT_APARTMENTTHREADED);
MFStartup(MF_VERSION);
std::cout << "Capturing screen for 10 seconds...";
DESKTOPCAPTUREPARAMS dp;
dp.f = L"capture.mp4";
dp.EndMS = 10000;
DesktopCapture(dp);
std::cout << "Done.\r\n";
return 0;
}
Where DESKTOPCAPTUREPARAMS is defined as follows:
struct DESKTOPCAPTUREPARAMS
{
    bool HasVideo = 1;
    bool HasAudio = 1;
    std::vector<std::tuple<std::wstring, std::vector<int>>> AudioFrom;
    GUID VIDEO_ENCODING_FORMAT = MFVideoFormat_H264;
    GUID AUDIO_ENCODING_FORMAT = MFAudioFormat_MP3;
    std::wstring f;
    void* cb = 0;
    std::function<HRESULT(const BYTE* d, size_t sz, void* cb)> Streamer;
    std::function<HRESULT(const BYTE* d, size_t sz, void* cb)> Framer;
    std::function<void(IMFAttributes* a)> PrepareAttributes;
    int fps = 25;
    int NumThreads = 0;
    int Qu = -1;
    int vbrm = 0;
    int vbrq = 0;
    int BR = 4000;
    int NCH = 2;
    int SR = 44100;
    int ABR = 192;
    bool Cursor = true;
    RECT rx = { 0,0,0,0 };
    HWND hWnd = 0;
    IDXGIAdapter1* ad = 0;
    UINT nOutput = 0;
    unsigned long long StartMS = 0; // 0, none
    unsigned long long EndMS = 0; // 0, none
    bool MustEnd = false;
    bool Pause = false;
};
Where:
- HasVideo = 1 -> You are capturing video. If this is set, the output file must be an MP4 or an ASF, regardless of whether you have audio or not.
- HasAudio = 1 -> You are capturing audio. If this is set and you do not have video, the output file must be an MP3 or FLAC.
- AudioFrom -> A vector of the audio devices you want to capture. Each element is a tuple of the device's unique ID (as returned by the enumeration, see VISTAMIXERS::EnumVistaMixers()) and a vector of the channels you want to record from. The library can also record from a playback device (like your speakers) in loopback. You can specify multiple recording sources and the library will mix them all into the final audio stream.
- VIDEO_ENCODING_FORMAT -> One of MFVideoFormat_H264, MFVideoFormat_HEVC, MFVideoFormat_VP90, MFVideoFormat_VP80. Use HEVC for HDR.
- AUDIO_ENCODING_FORMAT -> One of MFAudioFormat_MP3, MFAudioFormat_FLAC or MFAudioFormat_AAC. MP3 and AAC support only 44100/48000 Hz, 2-channel output.
- f -> Target file name (MP3/FLAC for audio only, MP4/ASF otherwise).
- fps -> Frames per second.
- NumThreads -> Threads for the video encoder, 0 for the default. Can be 0-16.
- Qu -> If >= 0 and <= 100, the quality-vs-speed video factor.
- vbrm and vbrq -> If vbrm is 2, then vbrq is a quality value between 0 and 100 (BR is ignored).
- BR -> Video bitrate in Kbps, default 4000. If vbrm is 2, BR is ignored.
- NCH -> Audio output channels.
- SR -> Audio output sample rate.
- ABR -> Audio bitrate in Kbps for MP3.
- Cursor -> true to capture the cursor. Ignored if HDR.
- rx -> If not {0}, capture only this specific rect.
- hWnd -> If not 0, capture only this HWND. If hWnd is 0 and rx = {0}, the entire screen is captured.
- ad -> If not 0, specifies which adapter to capture when you have more than one.
- nOutput -> The index of the monitor to capture; 0 is the first monitor.
- EndMS -> If not 0, the library stops when EndMS milliseconds have been captured. Otherwise, you have to stop the library by setting MustEnd to true.
- MustEnd -> Set to true for the library to stop capturing.
- Pause -> If true, capture is paused.
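Putting several of these together, here is a hypothetical setup that captures a 1280x720 region at 30 fps to HEVC with FLAC audio:

DESKTOPCAPTUREPARAMS dp;
dp.f = L"region.mp4";
dp.VIDEO_ENCODING_FORMAT = MFVideoFormat_HEVC; // HEVC is also the HDR choice
dp.AUDIO_ENCODING_FORMAT = MFAudioFormat_FLAC;
dp.fps = 30;
dp.rx = { 0, 0, 1280, 720 };   // capture only this rectangle
dp.EndMS = 30000;              // stop automatically after 30 seconds
DesktopCapture(dp);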
If you want to capture to a buffer, leave the "f" parameter empty and use the Streamer callback. It is called for as long as you return S_OK. If you use an ASF container, you need not do anything else. If you want an MP4 stream, you must prepare the streaming sample description (see this post). You can use this to stream your desktop over HTTP.
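For example, here is a sketch that collects the encoded stream into a memory buffer; it assumes the cb member is handed back as the callback's cb argument:

// Stream the encoded output to a memory buffer instead of a file.
std::vector<BYTE> encoded;
DESKTOPCAPTUREPARAMS dp;
dp.f.clear();             // no file: use the Streamer callback instead
dp.cb = &encoded;         // assumed to arrive as 'cb' below
dp.Streamer = [](const BYTE* d, size_t sz, void* cb) -> HRESULT
{
    auto& v = *(std::vector<BYTE>*)cb;
    v.insert(v.end(), d, d + sz);
    return S_OK;          // anything else stops the capture
};
dp.EndMS = 10000;
DesktopCapture(dp);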
Capturing Frames
Instead of capturing compressed video, you may use the Framer callback. It delivers a raw, upside-down RGBA array at your requested resolution (or a 64-bit half-float array if HDR) for as long as you return S_FALSE. Once you return S_OK, the function returns.
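For example, here is a sketch that grabs a single raw frame and then stops (the same cb assumption as above):

// Grab one raw RGBA frame, then tell the library to stop.
DESKTOPCAPTUREPARAMS dp;
std::vector<BYTE> frame;
dp.cb = &frame;
dp.Framer = [](const BYTE* d, size_t sz, void* cb) -> HRESULT
{
    auto& v = *(std::vector<BYTE>*)cb;
    v.assign(d, d + sz);  // upside-down RGBA pixels
    return S_OK;          // S_OK -> stop; S_FALSE -> keep delivering frames
};
DesktopCapture(dp);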
History
- 20th March, 2024: Added HDR support
- 2nd April, 2021: Capturing to stream, capturing frames
- 18th January, 2020: First release